From 86e018dbe2040b2453beea27d1efe3d5e6f47eed Mon Sep 17 00:00:00 2001 From: beform88 <2298266722@qq.com> Date: Tue, 16 Sep 2025 13:25:44 +0000 Subject: [PATCH 1/2] modified: evaluate/base/evaluation.py modified: evaluate/base/human_simulator.py modified: evaluate/experiments/threads/database_search/database_search.json new file: evaluate/experiments/threads/database_search/database_search_bash.py new file: evaluate/experiments/threads/database_search/run.sh new file: evaluate/experiments/threads/structure_generate/run.sh modified: evaluate/experiments/threads/structure_generate/structure_generate.json new file: evaluate/experiments/threads/structure_generate/structure_generate_bash.py --- evaluate/base/evaluation.py | 174 ++++++++++++++++++ evaluate/base/human_simulator.py | 18 +- .../database_search/database_search.json | 132 +++++-------- .../database_search/database_search_bash.py | 14 ++ .../threads/database_search/run.sh | 39 ++++ .../threads/structure_generate/run.sh | 39 ++++ .../structure_generate.json | 13 -- .../structure_generate_bash.py | 14 ++ 8 files changed, 341 insertions(+), 102 deletions(-) create mode 100644 evaluate/experiments/threads/database_search/database_search_bash.py create mode 100755 evaluate/experiments/threads/database_search/run.sh create mode 100755 evaluate/experiments/threads/structure_generate/run.sh create mode 100644 evaluate/experiments/threads/structure_generate/structure_generate_bash.py diff --git a/evaluate/base/evaluation.py b/evaluate/base/evaluation.py index 48796a28..a6b0f571 100644 --- a/evaluate/base/evaluation.py +++ b/evaluate/base/evaluation.py @@ -287,3 +287,177 @@ async def evaluation_threads_task(file_path, max_turn_count=10): print('\n' + '=' * 80) print('🎉 多轮对话测试完成!') print('=' * 80) + + +async def evaluation_threads_single_task(file_path, item_id, max_turn_count=10): + """与ADK agent进行多轮对话测试""" + print('=' * 80) + print('🤖 与ADK Agent多轮对话测试') + print('=' * 80) + + dataset_json = json.loads(load_dataset_json(file_path)) + eval_results = {} + dataset_item = dataset_json[item_id] + + time.sleep(10) # 避免请求过于频繁 + session_service = InMemorySessionService() + session = await session_service.create_session( + app_name='matmaster_agent', + user_id='human_simulator_test', + ) + + logger.info(f"Test Session: {session.id}") + + runner = Runner( + app_name='matmaster_agent', + agent=root_agent, + session_service=session_service + ) + + # 创建人类模拟器 + simulator = HumanSimulator(max_turn_count=max_turn_count) + + # 数据预处理 + scenario = { + 'name': dataset_item['initial_question'], + 'goal': ConversationGoal( + initial_question=dataset_item['initial_question'], + expected_outcomes=dataset_item['expected_outcomes'], + success_criteria=dataset_item['success_criteria'] + )} + + print(f"\n{'=' * 20} 测试场景: {scenario['name']} {'=' * 20}") + + # 设置对话目标 + simulator.set_goal(scenario['goal']) + initial_question = simulator.get_initial_question() + + print(f"🎯 对话目标: {initial_question}") + print(f"📋 期望结果: {', '.join(scenario['goal'].expected_outcomes)}") + print(f"✅ 成功标准: {', '.join(scenario['goal'].success_criteria)}") + + # 初始化记录 + eval_results['initial_question'] = initial_question + eval_results['expected_outcomes'] = scenario['goal'].expected_outcomes + eval_results['success_criteria'] = scenario['goal'].success_criteria + for i in range(1, max_turn_count+1): + eval_results[f'agent_response_{i}'] = '' + eval_results[f'user_response_{i}'] = '' + + # 开始对话 + conversation_ended = False + turn_count = 0 + + while not conversation_ended and turn_count < max_turn_count: + turn_count += 1 + print(f"\n🔄 第 {turn_count} 轮对话:") + + # 获取用户输入(从模拟器) + if turn_count == 1: + user_input = initial_question + else: + # 从模拟器获取响应 + user_input = simulator.get_last_user_response() + + print(f"🧑 模拟用户: {user_input}") + + # 调用ADK agent + try: + content = types.Content(role='user', parts=[types.Part(text=user_input)]) + + agent_response = '' + + events = runner.run_async( + user_id=session.user_id, + session_id=session.id, + new_message=content, + run_config=RunConfig(streaming_mode=StreamingMode.SSE) + ) + + # 收集agent响应 + async for event in events: + if event.content and event.content.parts: + for part in event.content.parts: + if part.text: + agent_response += part.text + except asyncio.CancelledError: + logger.error('任务被取消,可能是超时或作用域取消导致') + print(f"✅ 对话在第{turn_count}轮结束") + eval_results[f'agent_response_{turn_count}'] = '任务被取消,可能是超时或作用域取消导致' + break + except Exception as e: + logger.error(f"获取agent响应失败: {e}") + print(f"✅ 对话在第{turn_count}轮结束") + eval_results[f'agent_response_{turn_count}'] = str(e) + break + + eval_results[f'agent_response_{turn_count}'] = agent_response + print(f"🤖 ADK Agent: {agent_response}") + + job_jsons = re.findall(r'(.*?)', agent_response) + job_ids = [] + if job_jsons: + for job_json in job_jsons: + try: + job_json = json.loads(job_json) + if 'eventData' in job_json and 'content' in job_json['eventData']: + content = job_json['eventData']['content'] + if 'job_list' in content and 'job_id' in content['job_list']: + job_id = content['job_list']['job_id'] + job_ids.append(job_id) + except Exception as e: + logger.error(f"提取job_id失败: {e}") + + # 查询job状态 + if job_ids: + job_ids = list(set(job_ids)) + while True: + time.sleep(10) + all_finished = True + for job_id in job_ids: + bohrium_client = Bohrium() + job_info = bohrium_client.job.detail(job_id) + logger.info(f"查询到job状态: {job_id} - 状态: {job_info["status"]}") + if job_info['status'] not in [-1, 2]: + all_finished = False + if all_finished: + break + + # 使用模拟器生成用户响应 + user_response, should_continue = simulator.get_bohr_results(agent_response, job_ids) + eval_results[f'user_response_{turn_count}'] = user_response + print(f"🧑 模拟用户: {user_response}") + else: + # 使用模拟器生成用户响应 + user_response, should_continue = simulator.generate_response(agent_response) + eval_results[f'user_response_{turn_count}'] = user_response + print(f"🧑 模拟用户: {user_response}") + + if not should_continue: + print(f"✅ 对话在第{turn_count}轮结束") + break + + # 获取对话摘要 + summary = simulator.get_conversation_summary() + eval_results['total_turns'] = summary['total_turns'] + eval_results['final_state'] = summary['final_state'] + eval_results['duration_minutes'] = summary['duration_minutes'] + print(f"\n📊 对话摘要:") + print(f" - 总轮次: {summary['total_turns']}") + print(f" - 最终状态: {summary['final_state']}") + print(f" - 耗时: {summary['duration_minutes']:.1f} 分钟") + + with open('evaluation_results.json', 'a') as f: + json.dump(eval_results, f, indent=4, ensure_ascii=False) + + # 简单的成功判断 + if summary['final_state'] == 'satisfied': + print('✅ 测试通过: 对话成功完成') + else: + print('❌ 测试失败: 对话未成功完成') + + await runner.close() + + print('\n' + '=' * 80) + print('🎉 多轮对话测试完成!') + print('=' * 80) diff --git a/evaluate/base/human_simulator.py b/evaluate/base/human_simulator.py index 07fa3d1a..ccb58461 100644 --- a/evaluate/base/human_simulator.py +++ b/evaluate/base/human_simulator.py @@ -142,20 +142,22 @@ def _build_response_prompt(self, agent_message: str) -> str: Agent最新回复: {agent_message} -请分析agent的回复是否满足任务需求,并生成合适的响应。 +请分析agent的回复是否满足 初始问题 需求。如果回复大致符合初始任务要求,请结束对话。如果不符合,请分析不符合的点在哪儿,并生成简洁的用户回复,继续引导agent完成任务。 重要限制: -- 对话最多{self.max_turn_count}轮,当前是第{self.turn_count}轮 -- 除首轮对话外,其他轮次尽可能简短地回答agent的问题,回复内容紧扣初始问题,禁止发散 -- 如果agent在询问具体参数或设置,提供简洁明确的回答 -- 如果agent已经提供了初始任务所需的信息或完成了任务,请立刻结束对话 -- 禁止回复可能导致agent产生误解或偏离目标的内容 - -请以JSON格式回复: +- 对话最多{self.max_turn_count}轮,当前是第{self.turn_count}轮; +- 尽可能简短地回答agent的问题,回复内容紧扣初始问题,禁止发散,避免执行轮数超出限制; +- 如果agent在询问具体参数或设置,提供简洁明确的回答; +- 如果agent明确指出当前任务无法完成,请礼貌地结束对话; +- 如果agent已经提供了初始任务所需的信息或完成了任务,请立刻结束对话; +- agent仅能以文本形式回复,禁止要求agent提供可视化结果; + +请以如下JSON格式回复: {{ "response": "你的回复内容", "continue": true/false // 是否继续对话 }} + """ def get_bohr_results(self, agent_message: str, job_id: List[str]) -> Tuple[str, bool]: diff --git a/evaluate/experiments/threads/database_search/database_search.json b/evaluate/experiments/threads/database_search/database_search.json index 32fbe34a..d84d9840 100644 --- a/evaluate/experiments/threads/database_search/database_search.json +++ b/evaluate/experiments/threads/database_search/database_search.json @@ -1,47 +1,41 @@ [ { "expected_outcomes": [ - "\u627e\u5230 COD \u4e2d\u5e26\u9699 >2 eV \u7684 direct \u6c27\u5316\u7269", - "\u83b7\u5f97\u7ed3\u6784\u4fe1\u606f", - "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6" + "\u627e\u5230 Materials Project \u4e2d\u5e26\u9699 >2 eV \u7684 \u6c27\u5316\u7269" ], - "initial_question": "\u4ece COD \u4e2d\u641c\u7d22\u5e76\u8fd4\u56de\u4e09\u4e2a\u5e26\u9699\u5927\u4e8e 2 eV \u4e14\u4e3a direct \u7c7b\u578b\u7684\u6c27\u5316\u7269\u7ed3\u6784", + "initial_question": "\u5728 Materials Project \u4e2d\u68c0\u7d22\u5e76\u8fd4\u56de3\u4e2a\u5e26\u9699\u5927\u4e8e 2 eV \u7684\u6c27\u5316\u7269\u7ed3\u6784", "success_criteria": [ "\u6240\u6709\u7ed3\u679c\u5747\u4e3a\u6c27\u5316\u7269", - "\u5e26\u9699 >2 eV", - "\u5e26\u9699\u7c7b\u578b\u4e3a direct", - "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa", + "\u8fd4\u56de\u6587\u4ef6URL", "\u7ed3\u679c\u6570\u91cf\u6b63\u786e" ] }, { "expected_outcomes": [ "\u5728 Alexandria \u4e2d\u68c0\u7d22", - "\u627e\u5230\u6ee1\u8db3\u5143\u7d20\u548c\u5e26\u9699\u8303\u56f4\u7684\u6750\u6599", + "\u627e\u5230\u6ee1\u8db3\u5143\u7d20\u7684\u6750\u6599", "\u83b7\u5f97\u5bf9\u5e94\u7684\u6750\u6599\u4fe1\u606f", - "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6" + "\u6210\u529f\u8fd4\u56de\u5bf9\u5e94\u6587\u4ef6URL" ], - "initial_question": "\u5728 Alexandria \u4e2d\u67e5\u627e\u5e76\u8fd4\u56de 5 \u4e2a\u542b Li\u3001Mn\u3001O \u4e14\u5e26\u9699\u4f4d\u4e8e 2\u20133 eV \u8303\u56f4\u7684\u6750\u6599", + "initial_question": "\u5728 Alexandria \u4e2d\u67e5\u627e\u5e76\u8fd4\u56de 5 \u4e2a\u542b Li\u3001Mn\u3001O \u7684\u6750\u6599", "success_criteria": [ "\u6240\u6709\u7ed3\u679c\u5747\u6765\u81eaAlexandria", "\u6240\u6709\u7ed3\u679c\u5747\u542b Li\u3001Mn\u3001O", - "\u5e26\u9699\u5728 2\u20133 eV \u5185", - "CIF \u6587\u4ef6\u5bfc\u51fa\u5b8c\u6574", + "\u6587\u4ef6 URL \u5b8c\u6574", "\u7ed3\u679c\u6570\u91cf\u6b63\u786e" ] }, { "expected_outcomes": [ - "\u5728 Alexandria \u4e2d\u68c0\u7d22", - "\u627e\u5230\u7b26\u5408\u5143\u7d20\u548c\u7a7a\u95f4\u7fa4\u6761\u4ef6\u7684\u5c0f\u5e26\u9699\u5316\u5408\u7269", - "\u5bfc\u51fa CIF \u6587\u4ef6" + "\u5728 Materials Project \u4e2d\u68c0\u7d22", + "\u627e\u5230\u7b26\u5408\u5143\u7d20\u548c\u7a7a\u95f4\u7fa4\u6761\u4ef6\u7684\u5316\u5408\u7269", + "\u6210\u529f\u8fd4\u56de\u5bf9\u5e94\u6587\u4ef6URL" ], - "initial_question": "\u5728 Alexandria \u4e2d\u68c0\u7d22\u5e76\u8fd4\u56de\u4e0d\u8d85\u8fc7 2 \u4e2a\u4ec5\u542b Ti\u3001Al\u3001O \u4e14\u7a7a\u95f4\u7fa4\u4e3a 123 \u5e26\u9699 \u22651.5 eV \u7684\u5316\u5408\u7269\uff0c\u5e76\u5bfc\u51fa CIF \u6587\u4ef6", + "initial_question": "\u5728 Materials Project \u4e2d\u68c0\u7d22\u5e76\u8fd4\u56de\u4e0d\u8d85\u8fc7 2 \u4e2a\u4ec5\u542b Ti\u3001Al\u3001O \u4e14\u7a7a\u95f4\u7fa4\u4e3a 63 \u7684\u5316\u5408\u7269\uff0c\u5e76\u5bfc\u51fa CIF \u6587\u4ef6", "success_criteria": [ "\u7ed3\u679c\u53ea\u542b Ti\u3001Al\u3001O", - "\u7a7a\u95f4\u7fa4\u5747\u4e3a 123", - "\u5e26\u9699\u5747 \u22651.5 eV", - "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa", + "\u7a7a\u95f4\u7fa4\u5747\u4e3a 63", + "\u6587\u4ef6 URL \u5b8c\u6574", "\u7ed3\u679c\u6570\u91cf\u6b63\u786e" ] }, @@ -49,46 +43,39 @@ "expected_outcomes": [ "\u5728 Alexandria \u4e2d\u68c0\u7d22", "\u627e\u5230 Alexandria \u4e2d\u7684 Heusler \u5408\u91d1", - "\u5bfc\u51fa\u78c1\u77e9\u6570\u636e", - "\u5bf9\u78c1\u77e9\u6570\u636e\u6392\u5e8f", - "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6" + "\u6210\u529f\u5bfc\u8fd4\u56de\u6587\u4ef6 URL" ], - "initial_question": "\u5728 Alexandria \u4e2d\u968f\u673a\u68c0\u7d22 5 \u4e2a\u5df2\u77e5\u7684 Heusler \u5408\u91d1\uff0c\u5bfc\u51fa\u5176\u78c1\u77e9\u6570\u636e\u5e76\u4ece\u5c0f\u5230\u5927\u6392\u5e8f", + "initial_question": "\u5728 Alexandria \u4e2d\u968f\u673a\u68c0\u7d22 5 \u4e2a\u5df2\u77e5\u7684 Heusler \u5408\u91d1", "success_criteria": [ "\u7ed3\u679c\u5747\u4e3a Heusler \u5408\u91d1", - "\u78c1\u77e9\u6570\u636e\u5b8c\u6574\u5bfc\u51fa", - "\u6392\u5e8f\u7ed3\u679c\u6b63\u786e", - "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa", + "\u6587\u4ef6 URL \u5b8c\u6574", "\u7ed3\u679c\u6570\u91cf\u6b63\u786e" ] }, { "expected_outcomes": [ "\u5728 COD \u4e2d\u68c0\u7d22", - "\u627e\u5230\u7a7a\u95f4\u7fa4 225 \u7684 TiO\u2082 \u7ed3\u6784", - "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6" + "\u627e\u5230\u7a7a\u95f4\u7fa4 61 \u7684 \u7ed3\u6784", + "\u6210\u529f\u8fd4\u56de\u5bf9\u5e94\u6587\u4ef6URL" ], - "initial_question": "\u5728 COD \u4e2d\u67e5\u627e\u5e76\u8fd4\u56de\u4e0d\u8d85\u8fc7 5 \u4e2a\u7a7a\u95f4\u7fa4\u4e3a 225 \u7684 TiO\u2082 \u7ed3\u6784\uff0c\u5e76\u5bfc\u51fa CIF \u6587\u4ef6", + "initial_question": "\u5728 COD \u4e2d\u67e5\u627e\u5e76\u8fd4\u56de\u4e0d\u8d85\u8fc7 5 \u4e2a\u7a7a\u95f4\u7fa4\u4e3a 61 \u7684\u7ed3\u6784\uff0c\u5e76\u8fd4\u56de\u6587\u4ef6\u4e0b\u8f7d\u94fe\u63a5", "success_criteria": [ "\u6240\u6709\u7ed3\u679c\u5747\u6765\u81eaCOD", - "\u6240\u6709\u7ed3\u679c\u7a7a\u95f4\u7fa4\u4e3a 225", - "\u5168\u90e8\u7ed3\u679c\u4e3a TiO\u2082", - "CIF \u6587\u4ef6\u5bfc\u51fa\u5b8c\u6574", + "\u6240\u6709\u7ed3\u679c\u7a7a\u95f4\u7fa4\u4e3a 61", + "\u6587\u4ef6 URL \u5b8c\u6574", "\u7ed3\u679c\u6570\u91cf\u6b63\u786e" ] }, { "expected_outcomes": [ "\u627e\u5230\u539f\u5b50\u6570 \u22644 \u7684 BCC \u91d1\u5c5e", - "\u5bfc\u51fa CIF \u6587\u4ef6", - "\u8ba1\u7b97\u5e73\u5747\u6676\u80de\u4f53\u79ef" + "\u6210\u529f\u8fd4\u56de\u5bf9\u5e94\u6587\u4ef6URL" ], - "initial_question": "\u5728 COD \u4e2d\u68c0\u7d22\u539f\u5b50\u6570\u4e0d\u8d85\u8fc7 4 \u7684 BCC \u91d1\u5c5e\uff0c\u5e76\u5bfc\u51fa CIF \u6587\u4ef6\uff0c\u540c\u65f6\u7edf\u8ba1\u5e73\u5747\u6676\u80de\u4f53\u79ef", + "initial_question": "\u5728 COD \u4e2d\u68c0\u7d22\u539f\u5b50\u6570\u4e0d\u8d85\u8fc7 4 \u7684 BCC \u91d1\u5c5e\uff0c\u5e76\u8fd4\u56de\u6587\u4ef6\u4e0b\u8f7d\u94fe\u63a5", "success_criteria": [ "\u6240\u6709\u7ed3\u679c\u4e3a BCC \u91d1\u5c5e", "\u539f\u5b50\u6570 \u22644", - "CIF \u6587\u4ef6\u5bfc\u51fa\u5b8c\u6574", - "\u5e73\u5747\u6676\u80de\u4f53\u79ef\u8ba1\u7b97\u6b63\u786e", + "\u6587\u4ef6 URL \u5b8c\u6574", "\u7ed3\u679c\u6570\u91cf\u6b63\u786e" ] }, @@ -96,28 +83,14 @@ "expected_outcomes": [ "\u5728 COD \u4e2d\u68c0\u7d22", "\u83b7\u5f97\u7b26\u5408\u6761\u4ef6\u7684\u4e09\u5143\u5316\u5408\u7269", - "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6" + "\u6210\u529f\u8fd4\u56de\u6587\u4ef6 URL" ], "initial_question": "\u5728 COD \u4e2d\u68c0\u7d22\u5e76\u8fd4\u56de\u4e0d\u8d85\u8fc7 5 \u4e2a\u542b\u7a00\u571f\u3001\u8fc7\u6e21\u91d1\u5c5e\u548c\u6c27\uff0c\u5e76\u4e14\u4e0d\u5305\u542b Fe \u548c Ni \u7684\u4e09\u5143\u5316\u5408\u7269", "success_criteria": [ "\u6240\u6709\u7ed3\u679c\u5747\u6765\u81eaCOD", "\u5168\u90e8\u7ed3\u679c\u542b\u7a00\u571f\u3001\u8fc7\u6e21\u91d1\u5c5e\u548c\u6c27", "\u7ed3\u679c\u4e0d\u542b Fe \u548c Ni", - "CIF \u6587\u4ef6\u5bfc\u51fa\u5b8c\u6574", - "\u7ed3\u679c\u6570\u91cf\u6b63\u786e" - ] - }, - { - "expected_outcomes": [ - "\u7b5b\u9009\u51fa\u7b26\u5408\u5f39\u6027\u6a21\u91cf\u548c\u6cca\u677e\u6bd4\u6761\u4ef6\u7684\u65e0\u673a\u6750\u6599", - "\u83b7\u5f97\u5bf9\u5e94\u529b\u5b66\u6027\u8d28\u4fe1\u606f", - "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6" - ], - "initial_question": "\u5728 Materials Project \u4e2d\u67e5\u627e\u5e76\u8fd4\u56de 4 \u4e2a\u5f39\u6027\u6a21\u91cf\u5927\u4e8e 300 GPa \u4e14\u6cca\u677e\u6bd4\u5c0f\u4e8e 0.25 \u7684\u65e0\u673a\u6750\u6599", - "success_criteria": [ - "\u6240\u6709\u7ed3\u679c\u5f39\u6027\u6a21\u91cf >300 GPa", - "\u6cca\u677e\u6bd4 <0.25", - "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa", + "\u6587\u4ef6 URL \u5b8c\u6574", "\u7ed3\u679c\u6570\u91cf\u6b63\u786e" ] }, @@ -126,13 +99,13 @@ "\u5728Materials Project\u4e2d\u68c0\u7d22", "\u68c0\u7d22\u5230\u542b Si \u548c O \u7684\u56db\u5143\u5316\u5408\u7269", "\u6392\u9664 Fe \u548c Ni \u5143\u7d20", - "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6" + "\u6210\u529f\u8fd4\u56de\u5bf9\u5e94\u7684\u6587\u4ef6 URL" ], - "initial_question": "\u5728 Materials Project \u4e2d\u68c0\u7d22\u542b\u6709 Si \u548c O \u5207\u4e0d\u542b\u6709 Fe \u548c Ni \u7684\u56db\u5143\u5316\u5408\u7269\uff0c\u8fd4\u56de\u4e0d\u8d85\u8fc7 5 \u4e2a\u7ed3\u679c", + "initial_question": "\u5728 Materials Project \u4e2d\u68c0\u7d22\u542b\u6709 Si \u548c O \u4e14\u4e0d\u542b Fe \u548c Ni \u7684\u56db\u5143\u5316\u5408\u7269\uff0c\u8fd4\u56de\u4e0d\u8d85\u8fc7 5 \u4e2a\u7ed3\u679c", "success_criteria": [ "\u6240\u6709\u7ed3\u679c\u5747\u7b26\u5408\u8981\u6c42", "\u6240\u6709\u7ed3\u679c\u5747\u6765\u81eaMaterials Project", - "CIF \u6587\u4ef6\u5bfc\u51fa\u5b8c\u6574", + "\u6587\u4ef6 URL \u5b8c\u6574", "\u7ed3\u679c\u6570\u91cf\u6b63\u786e" ] }, @@ -140,9 +113,9 @@ "expected_outcomes": [ "\u83b7\u53d6 Alexandria \u4e0e COD \u4e2d MoS\u2082 \u5c42\u72b6\u6750\u6599\u7684\u6570\u636e", "\u5bf9\u6bd4\u5e26\u9699\u548c DOS", - "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6" + "\u6210\u529f\u8fd4\u56de CIF \u6587\u4ef6 URL" ], - "initial_question": "\u5bf9\u6bd4 Alexandria \u4e0e COD \u4e2d MoS\u2082 \u5c42\u72b6\u6750\u6599\u7684\u5e26\u9699\u548c DOS", + "initial_question": "\u5bf9\u6bd4 Alexandria \u4e0e COD \u4e2d MoS\u2082 \u5c42\u72b6\u6750\u6599\u7684\u5e26\u9699", "success_criteria": [ "\u6240\u6709\u7ed3\u679c\u5747\u4e3a MoS\u2082 \u5c42\u72b6\u6750\u6599", "\u5e26\u9699\u548c DOS \u5bf9\u6bd4\u5b8c\u6574", @@ -151,70 +124,67 @@ }, { "expected_outcomes": [ - "\u4ece\u4e24\u4e2a\u6570\u636e\u5e93\u83b7\u53d6 Fe\u2082O\u2083 \u7684\u5f62\u6210\u80fd\u548c\u5e26\u9699", - "\u8fdb\u884c\u5bf9\u6bd4\u5206\u6790", - "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6" + "\u4ece\u4e24\u4e2a\u4ee5\u4e0a\u7684\u6570\u636e\u5e93\u4e2d\u83b7\u53d6 Fe\u2082O\u2083 \u7684\u7ed3\u6784\u548c\u5e26\u9699", + "\u8fdb\u884c\u6570\u503c\u5bf9\u6bd4\u5206\u6790\uff0c\u4e0d\u9700\u8981\u7ed8\u56fe", + "\u6210\u529f\u8fd4\u56de\u6587\u4ef6 URL" ], - "initial_question": "\u5bf9\u6bd4 Materials Project \u4e0e Alexandria \u4e2d Fe\u2082O\u2083 \u7684\u5f62\u6210\u80fd\u548c\u5e26\u9699\u6570\u636e", + "initial_question": "\u5bf9\u6bd4\u4e0d\u540c\u6570\u636e\u5e93\u4e2d Fe\u2082O\u2083 \u7684\u5e26\u9699\u6570\u636e", "success_criteria": [ - "\u7ed3\u679c\u5305\u542b Fe\u2082O\u2083 \u5728\u4e24\u4e2a\u6570\u636e\u5e93\u4e2d\u7684\u6570\u636e", - "\u5f62\u6210\u80fd\u548c\u5e26\u9699\u6570\u503c\u5bf9\u6bd4\u6e05\u6670", - "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa" + "\u7ed3\u679c\u5305\u542b Fe\u2082O\u2083 \u5728\u591a\u4e2a\u6570\u636e\u5e93\u4e2d\u7684\u6570\u636e", + "\u5e26\u9699\u6570\u503c\u5bf9\u6bd4\u6e05\u6670", + "\u6587\u4ef6URL\u5b8c\u6574\u5bfc\u51fa" ] }, { "expected_outcomes": [ "\u4e0b\u8f7d\u542b\u9502\u6750\u6599\u7684 CIF \u6587\u4ef6", "\u8ba1\u7b97\u5e73\u5747\u4f53\u79ef/\u539f\u5b50", - "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6" + "\u6210\u529f\u8fd4\u56de CIF \u6587\u4ef6 URL" ], "initial_question": "\u6279\u91cf\u4e0b\u8f7d Materials Project \u4e2d\u524d 5 \u4e2a\u542b\u9502\u6750\u6599\u7684 CIF \u6587\u4ef6\uff0c\u5e76\u7edf\u8ba1\u5176\u5e73\u5747\u4f53\u79ef/\u539f\u5b50", "success_criteria": [ "\u4e0b\u8f7d\u7684 CIF \u6587\u4ef6\u6570\u91cf\u4e3a 5", "\u5e73\u5747\u4f53\u79ef/\u539f\u5b50\u8ba1\u7b97\u6b63\u786e", - "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa", + "\u6587\u4ef6URL\u5b8c\u6574\u5bfc\u51fa", "\u7ed3\u679c\u6570\u91cf\u6b63\u786e" ] }, { "expected_outcomes": [ "\u627e\u5230\u5c42\u72b6\u8fc7\u6e21\u91d1\u5c5e\u786b\u5316\u7269", - "\u7b5b\u9009\u5c42\u95f4\u7ed3\u5408\u80fd \u226450 meV/\u00c5\u00b2 \u7684\u5b50\u96c6", - "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6" + "\u6210\u529f\u8fd4\u56de\u6587\u4ef6 URL" ], - "initial_question": "\u68c0\u7d22\u5c42\u72b6\u8fc7\u6e21\u91d1\u5c5e\u786b\u5316\u7269\uff0c\u7b5b\u9009\u5e76\u8fd4\u56de 4 \u4e2a\u5c42\u95f4\u7ed3\u5408\u80fd\u4e0d\u8d85\u8fc7 50 meV/\u00c5\u00b2 \u7684\u7ed3\u679c", + "initial_question": "\u68c0\u7d22\u5e76\u8fd4\u56de 4 \u4e2a\u5c42\u72b6\u8fc7\u6e21\u91d1\u5c5e\u786b\u5316\u7269\u7684\u7ed3\u6784", "success_criteria": [ "\u5168\u90e8\u7ed3\u679c\u4e3a\u5c42\u72b6\u8fc7\u6e21\u91d1\u5c5e\u786b\u5316\u7269", - "\u5c42\u95f4\u7ed3\u5408\u80fd \u226450 meV/\u00c5\u00b2", - "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa", "\u7ed3\u679c\u6570\u91cf\u6b63\u786e" ] }, { "expected_outcomes": [ - "\u627e\u5230\u5e38\u89c1 MXene \u7ed3\u6784", + "\u627e\u5230 MXene \u7ed3\u6784", "\u7b5b\u9009\u51fa\u8868\u9762\u7ec8\u6b62\u57fa\u56e2\u4e3a O \u6216 F \u7684\u7ed3\u6784", - "\u5bfc\u51fa CIF \u6587\u4ef6" + "\u5bfc\u51fa\u6587\u4ef6" ], - "initial_question": "\u68c0\u7d22\u5e38\u89c1\u7684 3 \u4e2a MXene \u7ed3\u6784\uff0c\u7b5b\u9009\u8868\u9762\u7ec8\u6b62\u57fa\u56e2\u4e3a O \u6216 F \u7684\u7ed3\u679c\uff0c\u5e76\u5bfc\u51fa CIF \u6587\u4ef6", + "initial_question": "\u68c0\u7d22 3 \u4e2a\u542b\u6709 O \u6216 F \u7684 MXene \u7ed3\u6784\uff0c\u5e76\u8fd4\u56de\u6587\u4ef6URL", "success_criteria": [ "\u5168\u90e8\u7ed3\u679c\u4e3a MXene", "\u7b5b\u9009\u51fa\u7684\u7ed3\u679c\u8868\u9762\u7ec8\u6b62\u57fa\u56e2\u4e3a O \u6216 F", - "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa", + "URL\u5b8c\u6574\u5bfc\u51fa", "\u7ed3\u679c\u6570\u91cf\u6b63\u786e" ] }, { "expected_outcomes": [ - "\u627e\u5230\u4e8c\u7ef4\u5c42\u72b6\u6750\u6599", + "\u627e\u5230\u5c42\u72b6\u6750\u6599", "\u7b5b\u9009\u5e26\u9699 <1 eV \u7684\u5b50\u96c6", - "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6" + "\u6210\u529f\u8fd4\u56deURL" ], - "initial_question": "\u68c0\u7d22\u5e76\u7b5b\u9009\u7981\u5e26\u5bbd\u5ea6\u5c0f\u4e8e 1 eV \u7684\u4e8c\u7ef4\u5c42\u72b6\u6750\u6599\uff0c\u8fd4\u56de 5 \u4e2a\u7ed3\u679c", + "initial_question": "\u68c0\u7d22\u7981\u5e26\u5bbd\u5ea6\u5c0f\u4e8e 1 eV \u7684\u5c42\u72b6\u6750\u6599\uff0c\u8fd4\u56de 5 \u4e2a\u7ed3\u679c", "success_criteria": [ - "\u7ed3\u679c\u5747\u4e3a\u4e8c\u7ef4\u5c42\u72b6\u6750\u6599", + "\u7ed3\u679c\u5747\u4e3a\u5c42\u72b6\u6750\u6599", "\u5e26\u9699 <1 eV", - "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa", + "\u6587\u4ef6URL\u5b8c\u6574\u5bfc\u51fa", "\u7ed3\u679c\u6570\u91cf\u6b63\u786e" ] } diff --git a/evaluate/experiments/threads/database_search/database_search_bash.py b/evaluate/experiments/threads/database_search/database_search_bash.py new file mode 100644 index 00000000..70c4f715 --- /dev/null +++ b/evaluate/experiments/threads/database_search/database_search_bash.py @@ -0,0 +1,14 @@ +import asyncio +import argparse +from evaluate.base.evaluation import evaluation_threads_single_task + +if __name__ == '__main__': + # 运行测试 + print('🚀 人类模拟器启动') + print('=' * 50) + parser = argparse.ArgumentParser() + parser.add_argument('--max_turn_count', type=int, default=10, help='最大对话轮数') + parser.add_argument('--item_id', type=int, default=0, help='样本索引') + args = parser.parse_args() + + asyncio.run(evaluation_threads_single_task('database_search.json', item_id=args.item_id, max_turn_count=args.max_turn_count)) diff --git a/evaluate/experiments/threads/database_search/run.sh b/evaluate/experiments/threads/database_search/run.sh new file mode 100755 index 00000000..a6572de6 --- /dev/null +++ b/evaluate/experiments/threads/database_search/run.sh @@ -0,0 +1,39 @@ +#!/bin/bash +PYTHON=.venv/bin/python # your .venv +set -a +source .env # your .env +set +a + +export PYTHONPATH=/your/matmaster/path/MatMaster:$PYTHONPATH +export MAX_JOBS=3 + +TOTAL=$($PYTHON -c " +import os +import json +with open('database_search.json') as f: + dataset_json = json.load(f) +print(len(dataset_json)) +") + +echo '总数据量:' $TOTAL + +running_jobs=0 + +for ((i=0; i<$TOTAL; i++)); do + echo "🚀 提交任务: item $i" + sleep 3 + $PYTHON database_search_bash.py \ + --item_id $i > item_$i.log 2>&1 & + + ((running_jobs++)) + + # 如果正在运行的任务数达到上限,就等待任意一个完成 + if (( running_jobs >= MAX_JOBS )); then + wait -n + ((running_jobs--)) + fi +done + +# 等待最后一批任务 +wait +echo "✅ 所有任务完成" diff --git a/evaluate/experiments/threads/structure_generate/run.sh b/evaluate/experiments/threads/structure_generate/run.sh new file mode 100755 index 00000000..7f5e39bb --- /dev/null +++ b/evaluate/experiments/threads/structure_generate/run.sh @@ -0,0 +1,39 @@ +#!/bin/bash +PYTHON=.venv/bin/python # your .venv +set -a +source .env # your .env +set +a + +export PYTHONPATH=/your/matmaster/path/MatMaster:$PYTHONPATH +export MAX_JOBS=3 + +TOTAL=$($PYTHON -c " +import os +import json +with open('structure_generate.json') as f: + dataset_json = json.load(f) +print(len(dataset_json)) +") + +echo '总数据量:' $TOTAL + +running_jobs=0 + +for ((i=0; i<$TOTAL; i++)); do + echo "🚀 提交任务: item $i" + sleep 3 + $PYTHON structure_generate_bash.py \ + --item_id $i > item_$i.log 2>&1 & + + ((running_jobs++)) + + # 如果正在运行的任务数达到上限,就等待任意一个完成 + if (( running_jobs >= MAX_JOBS )); then + wait -n + ((running_jobs--)) + fi +done + +# 等待最后一批任务 +wait +echo "✅ 所有任务完成" diff --git a/evaluate/experiments/threads/structure_generate/structure_generate.json b/evaluate/experiments/threads/structure_generate/structure_generate.json index a4469b34..8f014835 100644 --- a/evaluate/experiments/threads/structure_generate/structure_generate.json +++ b/evaluate/experiments/threads/structure_generate/structure_generate.json @@ -88,19 +88,6 @@ "CIF \u6587\u4ef6\u5bfc\u51fa\u5b8c\u6574" ] }, - { - "expected_outcomes": [ - "\u63a2\u7d22 Al\u2013Cu\u2013Fe \u4f53\u7cfb\u4e2d\u53ef\u80fd\u7684\u51c6\u6676\u6216\u51c6\u5468\u671f\u6784\u578b\u5019\u9009", - "\u5bf9\u5019\u9009\u7ed3\u6784\u8fdb\u884c\u5bf9\u79f0\u6027\u5206\u6790\uff08\u5982\u65cb\u8f6c\u5bf9\u79f0\u3001\u957f\u7a0b\u6709\u5e8f/\u65e0\u5468\u671f\u6027\u7279\u5f81\uff09", - "\u63d0\u4f9b\u53ef\u89c6\u5316/\u63cf\u8ff0\u6027\u5206\u6790\u7ed3\u679c\u548c\u7ed3\u6784\u6587\u4ef6" - ], - "initial_question": "\u9884\u6d4b Al\u2013Cu\u2013Fe \u4e09\u5143\u4f53\u7cfb\u53ef\u80fd\u5f62\u6210\u7684\u51c6\u6676\u7ed3\u6784\uff0c\u5e76\u5206\u6790\u5176\u5bf9\u79f0\u6027\u7279\u5f81", - "success_criteria": [ - "\u751f\u6210\u6216\u8bc6\u522b\u82e5\u5e72\u53ef\u80fd\u7684\u51c6\u6676\u5019\u9009\u7ed3\u6784\uff08\u6216\u7ed9\u51fa\u4e0d\u5b58\u5728\u7684\u7ed3\u8bba\uff09", - "\u5b8c\u6210\u5bf9\u79f0\u6027\u7279\u5f81\u7684\u5b9a\u6027\u4e0e\u91cf\u5316\u5206\u6790\uff08\u4f8b\u5982\u5b58\u5728\u7684\u65cb\u8f6c\u5bf9\u79f0\u9636\u6570\u3001\u7f3a\u4e4f\u5e73\u79fb\u5468\u671f\u6027\u6216\u8fd1\u4f3c\u8d85\u6676\u683c\u63cf\u8ff0\uff09", - "\u63d0\u4f9b\u53ef\u5bfc\u51fa\u7684\u7ed3\u6784\u8868\u793a\u6216\u793a\u610f\u6587\u4ef6" - ] - }, { "expected_outcomes": [ "\u751f\u6210 Li\u2013S \u591a\u786b\u5316\u7269\u7684\u5019\u9009\u7ed3\u6784\u96c6\u5408", diff --git a/evaluate/experiments/threads/structure_generate/structure_generate_bash.py b/evaluate/experiments/threads/structure_generate/structure_generate_bash.py new file mode 100644 index 00000000..a09fc0b1 --- /dev/null +++ b/evaluate/experiments/threads/structure_generate/structure_generate_bash.py @@ -0,0 +1,14 @@ +import asyncio +import argparse +from evaluate.base.evaluation import evaluation_threads_single_task + +if __name__ == '__main__': + # 运行测试 + print('🚀 人类模拟器启动') + print('=' * 50) + parser = argparse.ArgumentParser() + parser.add_argument('--max_turn_count', type=int, default=20, help='最大对话轮数') + parser.add_argument('--item_id', type=int, default=0, help='样本索引') + args = parser.parse_args() + + asyncio.run(evaluation_threads_single_task('structure_generate.json', item_id=args.item_id, max_turn_count=args.max_turn_count)) From d8d220c0acabe7a06d6e8478bb372cb1909704fc Mon Sep 17 00:00:00 2001 From: beform88 <2298266722@qq.com> Date: Tue, 16 Sep 2025 14:37:39 +0000 Subject: [PATCH 2/2] modified: evaluate/base/evaluation.py --- evaluate/base/evaluation.py | 328 ++++++++++-------------------------- 1 file changed, 90 insertions(+), 238 deletions(-) diff --git a/evaluate/base/evaluation.py b/evaluate/base/evaluation.py index a6b0f571..cfe3e058 100644 --- a/evaluate/base/evaluation.py +++ b/evaluate/base/evaluation.py @@ -115,191 +115,20 @@ def multi_turn_evaluation_task(dataset_item): return result -async def evaluation_threads_task(file_path, max_turn_count=10): - """与ADK agent进行多轮对话测试""" - print('=' * 80) - print('🤖 与ADK Agent多轮对话测试') - print('=' * 80) - - dataset_json = json.loads(load_dataset_json(file_path)) - eval_results = [] - for index, dataset_item in enumerate(dataset_json): - time.sleep(10) # 避免请求过于频繁 - session_service = InMemorySessionService() - session = await session_service.create_session( - app_name='matmaster_agent', - user_id='human_simulator_test', - ) - - logger.info(f"Test Session: {session.id}") - - runner = Runner( - app_name='matmaster_agent', - agent=root_agent, - session_service=session_service - ) - - # 创建人类模拟器 - simulator = HumanSimulator(max_turn_count=max_turn_count) - - # 数据预处理 - scenario = { - 'name': dataset_item['initial_question'], - 'goal': ConversationGoal( - initial_question=dataset_item['initial_question'], - expected_outcomes=dataset_item['expected_outcomes'], - success_criteria=dataset_item['success_criteria'] - )} - - print(f"\n{'=' * 20} 测试场景: {scenario['name']} {'=' * 20}") - - # 设置对话目标 - simulator.set_goal(scenario['goal']) - initial_question = simulator.get_initial_question() - - print(f"🎯 对话目标: {initial_question}") - print(f"📋 期望结果: {', '.join(scenario['goal'].expected_outcomes)}") - print(f"✅ 成功标准: {', '.join(scenario['goal'].success_criteria)}") - - # 初始化记录 - eval_results.append({}) - eval_results[index]['initial_question'] = initial_question - eval_results[index]['expected_outcomes'] = scenario['goal'].expected_outcomes - eval_results[index]['success_criteria'] = scenario['goal'].success_criteria - for i in range(1, 6): - eval_results[index][f'agent_response_{i}'] = '' - eval_results[index][f'user_response_{i}'] = '' - - # 开始对话 - conversation_ended = False - turn_count = 0 - - while not conversation_ended and turn_count < 10: - turn_count += 1 - print(f"\n🔄 第 {turn_count} 轮对话:") - - # 获取用户输入(从模拟器) - if turn_count == 1: - user_input = initial_question - else: - # 从模拟器获取响应 - user_input = simulator.get_last_user_response() - - print(f"🧑 模拟用户: {user_input}") - - # 调用ADK agent - try: - content = types.Content(role='user', parts=[types.Part(text=user_input)]) - - agent_response = '' - - events = runner.run_async( - user_id=session.user_id, - session_id=session.id, - new_message=content, - run_config=RunConfig(streaming_mode=StreamingMode.SSE) - ) - - # 收集agent响应 - async for event in events: - if event.content and event.content.parts: - for part in event.content.parts: - if part.text: - agent_response += part.text - except asyncio.CancelledError: - logger.error('任务被取消,可能是超时或作用域取消导致') - print(f"✅ 对话在第{turn_count}轮结束") - eval_results[index][f'agent_response_{turn_count}'] = '任务被取消,可能是超时或作用域取消导致' - break - except Exception as e: - logger.error(f"获取agent响应失败: {e}") - print(f"✅ 对话在第{turn_count}轮结束") - eval_results[index][f'agent_response_{turn_count}'] = str(e) - break - - eval_results[index][f'agent_response_{turn_count}'] = agent_response - print(f"🤖 ADK Agent: {agent_response}") - - job_jsons = re.findall(r'(.*?)', agent_response) - job_ids = [] - if job_jsons: - for job_json in job_jsons: - try: - job_json = json.loads(job_json) - if 'eventData' in job_json and 'content' in job_json['eventData']: - content = job_json['eventData']['content'] - if 'job_list' in content and 'job_id' in content['job_list']: - job_id = content['job_list']['job_id'] - job_ids.append(job_id) - except Exception as e: - logger.error(f"提取job_id失败: {e}") - - # 查询job状态 - if job_ids: - job_ids = list(set(job_ids)) - while True: - time.sleep(10) - all_finished = True - for job_id in job_ids: - bohrium_client = Bohrium() - job_info = bohrium_client.job.detail(job_id) - logger.info(f"查询到job状态: {job_id} - 状态: {job_info["status"]}") - if job_info['status'] not in [-1, 2]: - all_finished = False - if all_finished: - break - - # 使用模拟器生成用户响应 - user_response, should_continue = simulator.get_bohr_results(agent_response, job_ids) - eval_results[index][f'user_response_{turn_count}'] = user_response - print(f"🧑 模拟用户: {user_response}") - else: - # 使用模拟器生成用户响应 - user_response, should_continue = simulator.generate_response(agent_response) - eval_results[index][f'user_response_{turn_count}'] = user_response - print(f"🧑 模拟用户: {user_response}") - - if not should_continue: - print(f"✅ 对话在第{turn_count}轮结束") - break - - # 获取对话摘要 - summary = simulator.get_conversation_summary() - eval_results[index]['total_turns'] = summary['total_turns'] - eval_results[index]['final_state'] = summary['final_state'] - eval_results[index]['duration_minutes'] = summary['duration_minutes'] - print(f"\n📊 对话摘要:") - print(f" - 总轮次: {summary['total_turns']}") - print(f" - 最终状态: {summary['final_state']}") - print(f" - 耗时: {summary['duration_minutes']:.1f} 分钟") - - with open('evaluation_results.json', 'w') as f: - json.dump(eval_results, f, indent=4, ensure_ascii=False) - - # 简单的成功判断 - if summary['final_state'] == 'satisfied': - print('✅ 测试通过: 对话成功完成') - else: - print('❌ 测试失败: 对话未成功完成') - - await runner.close() - - print('\n' + '=' * 80) - print('🎉 多轮对话测试完成!') - print('=' * 80) - - -async def evaluation_threads_single_task(file_path, item_id, max_turn_count=10): - """与ADK agent进行多轮对话测试""" - print('=' * 80) - print('🤖 与ADK Agent多轮对话测试') - print('=' * 80) +import asyncio +import json +import re +import time +from typing import Dict, Any, List - dataset_json = json.loads(load_dataset_json(file_path)) - eval_results = {} - dataset_item = dataset_json[item_id] - time.sleep(10) # 避免请求过于频繁 +async def _run_conversation(dataset_item: Dict[str, Any], max_turn_count: int, save_mode: str = 'w') -> Dict[str, Any]: + """ + 执行一次对话测试,并返回结果 + :param dataset_item: 单条测试数据 + :param max_turn_count: 最大对话轮次 + :param save_mode: 写文件模式 ("w" 覆盖 / "a" 追加) + """ session_service = InMemorySessionService() session = await session_service.create_session( app_name='matmaster_agent', @@ -314,21 +143,19 @@ async def evaluation_threads_single_task(file_path, item_id, max_turn_count=10): session_service=session_service ) - # 创建人类模拟器 simulator = HumanSimulator(max_turn_count=max_turn_count) - # 数据预处理 + # 场景初始化 scenario = { 'name': dataset_item['initial_question'], 'goal': ConversationGoal( initial_question=dataset_item['initial_question'], expected_outcomes=dataset_item['expected_outcomes'], success_criteria=dataset_item['success_criteria'] - )} - + ) + } print(f"\n{'=' * 20} 测试场景: {scenario['name']} {'=' * 20}") - # 设置对话目标 simulator.set_goal(scenario['goal']) initial_question = simulator.get_initial_question() @@ -336,35 +163,29 @@ async def evaluation_threads_single_task(file_path, item_id, max_turn_count=10): print(f"📋 期望结果: {', '.join(scenario['goal'].expected_outcomes)}") print(f"✅ 成功标准: {', '.join(scenario['goal'].success_criteria)}") - # 初始化记录 - eval_results['initial_question'] = initial_question - eval_results['expected_outcomes'] = scenario['goal'].expected_outcomes - eval_results['success_criteria'] = scenario['goal'].success_criteria - for i in range(1, max_turn_count+1): + # 初始化结果 + eval_results = { + 'initial_question': initial_question, + 'expected_outcomes': scenario['goal'].expected_outcomes, + 'success_criteria': scenario['goal'].success_criteria, + } + for i in range(1, max_turn_count + 1): eval_results[f'agent_response_{i}'] = '' eval_results[f'user_response_{i}'] = '' - # 开始对话 - conversation_ended = False + # 对话循环 turn_count = 0 - - while not conversation_ended and turn_count < max_turn_count: + while turn_count < max_turn_count: turn_count += 1 print(f"\n🔄 第 {turn_count} 轮对话:") - # 获取用户输入(从模拟器) - if turn_count == 1: - user_input = initial_question - else: - # 从模拟器获取响应 - user_input = simulator.get_last_user_response() - + # 获取用户输入 + user_input = initial_question if turn_count == 1 else simulator.get_last_user_response() print(f"🧑 模拟用户: {user_input}") - # 调用ADK agent + # 调用 agent try: content = types.Content(role='user', parts=[types.Part(text=user_input)]) - agent_response = '' events = runner.run_async( @@ -374,41 +195,38 @@ async def evaluation_threads_single_task(file_path, item_id, max_turn_count=10): run_config=RunConfig(streaming_mode=StreamingMode.SSE) ) - # 收集agent响应 async for event in events: if event.content and event.content.parts: for part in event.content.parts: if part.text: agent_response += part.text except asyncio.CancelledError: - logger.error('任务被取消,可能是超时或作用域取消导致') - print(f"✅ 对话在第{turn_count}轮结束") - eval_results[f'agent_response_{turn_count}'] = '任务被取消,可能是超时或作用域取消导致' + msg = '任务被取消,可能是超时或作用域取消导致' + logger.error(msg) + eval_results[f'agent_response_{turn_count}'] = msg break except Exception as e: logger.error(f"获取agent响应失败: {e}") - print(f"✅ 对话在第{turn_count}轮结束") eval_results[f'agent_response_{turn_count}'] = str(e) break eval_results[f'agent_response_{turn_count}'] = agent_response print(f"🤖 ADK Agent: {agent_response}") + # 提取 job_id job_jsons = re.findall(r'(.*?)', agent_response) - job_ids = [] - if job_jsons: - for job_json in job_jsons: - try: - job_json = json.loads(job_json) - if 'eventData' in job_json and 'content' in job_json['eventData']: - content = job_json['eventData']['content'] - if 'job_list' in content and 'job_id' in content['job_list']: - job_id = content['job_list']['job_id'] - job_ids.append(job_id) - except Exception as e: - logger.error(f"提取job_id失败: {e}") - - # 查询job状态 + job_ids: List[str] = [] + for job_json in job_jsons: + try: + job_json = json.loads(job_json) + if 'eventData' in job_json and 'content' in job_json['eventData']: + content = job_json['eventData']['content'] + if 'job_list' in content and 'job_id' in content['job_list']: + job_ids.append(content['job_list']['job_id']) + except Exception as e: + logger.error(f"提取job_id失败: {e}") + + # 查询 job 状态 if job_ids: job_ids = list(set(job_ids)) while True: @@ -417,47 +235,81 @@ async def evaluation_threads_single_task(file_path, item_id, max_turn_count=10): for job_id in job_ids: bohrium_client = Bohrium() job_info = bohrium_client.job.detail(job_id) - logger.info(f"查询到job状态: {job_id} - 状态: {job_info["status"]}") + logger.info(f"查询到job状态: {job_id} - 状态: {job_info['status']}") if job_info['status'] not in [-1, 2]: all_finished = False if all_finished: break - # 使用模拟器生成用户响应 user_response, should_continue = simulator.get_bohr_results(agent_response, job_ids) - eval_results[f'user_response_{turn_count}'] = user_response - print(f"🧑 模拟用户: {user_response}") else: - # 使用模拟器生成用户响应 user_response, should_continue = simulator.generate_response(agent_response) - eval_results[f'user_response_{turn_count}'] = user_response - print(f"🧑 模拟用户: {user_response}") + + eval_results[f'user_response_{turn_count}'] = user_response + print(f"🧑 模拟用户: {user_response}") if not should_continue: print(f"✅ 对话在第{turn_count}轮结束") break - # 获取对话摘要 + # 对话总结 summary = simulator.get_conversation_summary() - eval_results['total_turns'] = summary['total_turns'] - eval_results['final_state'] = summary['final_state'] - eval_results['duration_minutes'] = summary['duration_minutes'] + eval_results.update({ + 'total_turns': summary['total_turns'], + 'final_state': summary['final_state'], + 'duration_minutes': summary['duration_minutes'], + }) + print(f"\n📊 对话摘要:") print(f" - 总轮次: {summary['total_turns']}") print(f" - 最终状态: {summary['final_state']}") print(f" - 耗时: {summary['duration_minutes']:.1f} 分钟") - with open('evaluation_results.json', 'a') as f: + # 保存结果 + with open('evaluation_results.json', save_mode) as f: json.dump(eval_results, f, indent=4, ensure_ascii=False) - # 简单的成功判断 if summary['final_state'] == 'satisfied': print('✅ 测试通过: 对话成功完成') else: print('❌ 测试失败: 对话未成功完成') await runner.close() + return eval_results + + +async def evaluation_threads_task(file_path: str, max_turn_count: int = 10): + """批量测试所有数据""" + print('=' * 80) + print('🤖 与ADK Agent多轮对话测试') + print('=' * 80) + + dataset_json = json.loads(load_dataset_json(file_path)) + results = [] + for i, dataset_item in enumerate(dataset_json): + time.sleep(10) # 避免请求过于频繁 + result = await _run_conversation(dataset_item, max_turn_count, save_mode='w' if i == 0 else 'a') + results.append(result) print('\n' + '=' * 80) print('🎉 多轮对话测试完成!') print('=' * 80) + return results + + +async def evaluation_threads_single_task(file_path: str, item_id: int, max_turn_count: int = 10): + """测试单个数据""" + print('=' * 80) + print('🤖 与ADK Agent多轮对话测试') + print('=' * 80) + + dataset_json = json.loads(load_dataset_json(file_path)) + dataset_item = dataset_json[item_id] + time.sleep(10) # 避免请求过于频繁 + + result = await _run_conversation(dataset_item, max_turn_count, save_mode='a') + + print('\n' + '=' * 80) + print('🎉 单条多轮对话测试完成!') + print('=' * 80) + return result