diff --git a/evaluate/base/evaluation.py b/evaluate/base/evaluation.py
index 48796a28..cfe3e058 100644
--- a/evaluate/base/evaluation.py
+++ b/evaluate/base/evaluation.py
@@ -115,175 +115,201 @@ def multi_turn_evaluation_task(dataset_item):
return result
-async def evaluation_threads_task(file_path, max_turn_count=10):
- """与ADK agent进行多轮对话测试"""
- print('=' * 80)
- print('🤖 与ADK Agent多轮对话测试')
- print('=' * 80)
+import asyncio
+import json
+import re
+import time
+from typing import Dict, Any, List
- dataset_json = json.loads(load_dataset_json(file_path))
- eval_results = []
- for index, dataset_item in enumerate(dataset_json):
- time.sleep(10) # 避免请求过于频繁
- session_service = InMemorySessionService()
- session = await session_service.create_session(
- app_name='matmaster_agent',
- user_id='human_simulator_test',
+
+async def _run_conversation(dataset_item: Dict[str, Any], max_turn_count: int, save_mode: str = 'w') -> Dict[str, Any]:
+    """Run one simulated multi-turn conversation for a single dataset item and return its result record.
+
+    :param dataset_item: one test case; its 'initial_question', 'expected_outcomes' and 'success_criteria' keys are read
+    :param max_turn_count: maximum number of conversation turns
+    :param save_mode: file mode used when writing the results file ("w" overwrite / "a" append)
+    """
+    session_service = InMemorySessionService()
+    session = await session_service.create_session(
+        app_name='matmaster_agent',
+        user_id='human_simulator_test',
+    )
+
+    logger.info(f"Test Session: {session.id}")
+
+    runner = Runner(
+        app_name='matmaster_agent',
+        agent=root_agent,
+        session_service=session_service
+    )
+
+    simulator = HumanSimulator(max_turn_count=max_turn_count)
+
+    # Build the scenario: its name is the initial question, its goal wraps the three dataset fields
+    scenario = {
+        'name': dataset_item['initial_question'],
+        'goal': ConversationGoal(
+            initial_question=dataset_item['initial_question'],
+            expected_outcomes=dataset_item['expected_outcomes'],
+            success_criteria=dataset_item['success_criteria']
)
+ }
+ print(f"\n{'=' * 20} 测试场景: {scenario['name']} {'=' * 20}")
- logger.info(f"Test Session: {session.id}")
+ simulator.set_goal(scenario['goal'])
+ initial_question = simulator.get_initial_question()
- runner = Runner(
- app_name='matmaster_agent',
- agent=root_agent,
- session_service=session_service
- )
+ print(f"🎯 对话目标: {initial_question}")
+ print(f"📋 期望结果: {', '.join(scenario['goal'].expected_outcomes)}")
+ print(f"✅ 成功标准: {', '.join(scenario['goal'].success_criteria)}")
+
+    # Initialise the result record; every turn slot is pre-filled with '' so unused turns still have keys
+    eval_results = {
+        'initial_question': initial_question,
+        'expected_outcomes': scenario['goal'].expected_outcomes,
+        'success_criteria': scenario['goal'].success_criteria,
+    }
+    for i in range(1, max_turn_count + 1):
+        eval_results[f'agent_response_{i}'] = ''
+        eval_results[f'user_response_{i}'] = ''
+
+    # Conversation loop: runs at most max_turn_count turns
+    turn_count = 0
+    while turn_count < max_turn_count:
+        turn_count += 1
+        print(f"\n🔄 第 {turn_count} 轮对话:")
+
+        # User input: the initial question on turn 1, afterwards the simulator's last reply
+        user_input = initial_question if turn_count == 1 else simulator.get_last_user_response()
+        print(f"🧑 模拟用户: {user_input}")
+
+        # Call the agent and concatenate the text parts of every streamed event
+        try:
+            content = types.Content(role='user', parts=[types.Part(text=user_input)])
+            agent_response = ''
+
+            events = runner.run_async(
+                user_id=session.user_id,
+                session_id=session.id,
+                new_message=content,
+                run_config=RunConfig(streaming_mode=StreamingMode.SSE)
+            )
+
+            async for event in events:
+                if event.content and event.content.parts:
+                    for part in event.content.parts:
+                        if part.text:
+                            agent_response += part.text
+        except asyncio.CancelledError:  # cancellation is recorded in the result and ends the loop; it is not re-raised
+            msg = '任务被取消,可能是超时或作用域取消导致'
+            logger.error(msg)
+            eval_results[f'agent_response_{turn_count}'] = msg
+            break
+        except Exception as e:  # any other failure is stored as this turn's agent response and ends the loop
+            logger.error(f"获取agent响应失败: {e}")
+            eval_results[f'agent_response_{turn_count}'] = str(e)
+            break
- # 创建人类模拟器
- simulator = HumanSimulator(max_turn_count=max_turn_count)
-
- # 数据预处理
- scenario = {
- 'name': dataset_item['initial_question'],
- 'goal': ConversationGoal(
- initial_question=dataset_item['initial_question'],
- expected_outcomes=dataset_item['expected_outcomes'],
- success_criteria=dataset_item['success_criteria']
- )}
-
- print(f"\n{'=' * 20} 测试场景: {scenario['name']} {'=' * 20}")
-
- # 设置对话目标
- simulator.set_goal(scenario['goal'])
- initial_question = simulator.get_initial_question()
-
- print(f"🎯 对话目标: {initial_question}")
- print(f"📋 期望结果: {', '.join(scenario['goal'].expected_outcomes)}")
- print(f"✅ 成功标准: {', '.join(scenario['goal'].success_criteria)}")
-
- # 初始化记录
- eval_results.append({})
- eval_results[index]['initial_question'] = initial_question
- eval_results[index]['expected_outcomes'] = scenario['goal'].expected_outcomes
- eval_results[index]['success_criteria'] = scenario['goal'].success_criteria
- for i in range(1, 6):
- eval_results[index][f'agent_response_{i}'] = ''
- eval_results[index][f'user_response_{i}'] = ''
-
- # 开始对话
- conversation_ended = False
- turn_count = 0
-
- while not conversation_ended and turn_count < 10:
- turn_count += 1
- print(f"\n🔄 第 {turn_count} 轮对话:")
-
- # 获取用户输入(从模拟器)
- if turn_count == 1:
- user_input = initial_question
- else:
- # 从模拟器获取响应
- user_input = simulator.get_last_user_response()
-
- print(f"🧑 模拟用户: {user_input}")
-
- # 调用ADK agent
+        eval_results[f'agent_response_{turn_count}'] = agent_response
+        print(f"🤖 ADK Agent: {agent_response}")
+
+        # Extract job IDs from JSON snippets in the reply. NOTE(review): the pattern below has no delimiters around its lazy group, so as written it only yields empty matches - the enclosing tag literals appear to be missing; confirm against the agent's message format.
+        job_jsons = re.findall(r'(.*?)', agent_response)
+        job_ids: List[str] = []
+        for job_json in job_jsons:
try:
- content = types.Content(role='user', parts=[types.Part(text=user_input)])
-
- agent_response = ''
-
- events = runner.run_async(
- user_id=session.user_id,
- session_id=session.id,
- new_message=content,
- run_config=RunConfig(streaming_mode=StreamingMode.SSE)
- )
-
- # 收集agent响应
- async for event in events:
- if event.content and event.content.parts:
- for part in event.content.parts:
- if part.text:
- agent_response += part.text
- except asyncio.CancelledError:
- logger.error('任务被取消,可能是超时或作用域取消导致')
- print(f"✅ 对话在第{turn_count}轮结束")
- eval_results[index][f'agent_response_{turn_count}'] = '任务被取消,可能是超时或作用域取消导致'
- break
+ job_json = json.loads(job_json)
+ if 'eventData' in job_json and 'content' in job_json['eventData']:
+ content = job_json['eventData']['content']
+ if 'job_list' in content and 'job_id' in content['job_list']:
+ job_ids.append(content['job_list']['job_id'])
except Exception as e:
- logger.error(f"获取agent响应失败: {e}")
- print(f"✅ 对话在第{turn_count}轮结束")
- eval_results[index][f'agent_response_{turn_count}'] = str(e)
- break
-
- eval_results[index][f'agent_response_{turn_count}'] = agent_response
- print(f"🤖 ADK Agent: {agent_response}")
-
- job_jsons = re.findall(r'(.*?)', agent_response)
- job_ids = []
- if job_jsons:
- for job_json in job_jsons:
- try:
- job_json = json.loads(job_json)
- if 'eventData' in job_json and 'content' in job_json['eventData']:
- content = job_json['eventData']['content']
- if 'job_list' in content and 'job_id' in content['job_list']:
- job_id = content['job_list']['job_id']
- job_ids.append(job_id)
- except Exception as e:
- logger.error(f"提取job_id失败: {e}")
-
- # 查询job状态
- if job_ids:
- job_ids = list(set(job_ids))
- while True:
- time.sleep(10)
- all_finished = True
- for job_id in job_ids:
- bohrium_client = Bohrium()
- job_info = bohrium_client.job.detail(job_id)
- logger.info(f"查询到job状态: {job_id} - 状态: {job_info["status"]}")
- if job_info['status'] not in [-1, 2]:
- all_finished = False
- if all_finished:
- break
-
- # 使用模拟器生成用户响应
- user_response, should_continue = simulator.get_bohr_results(agent_response, job_ids)
- eval_results[index][f'user_response_{turn_count}'] = user_response
- print(f"🧑 模拟用户: {user_response}")
- else:
- # 使用模拟器生成用户响应
- user_response, should_continue = simulator.generate_response(agent_response)
- eval_results[index][f'user_response_{turn_count}'] = user_response
- print(f"🧑 模拟用户: {user_response}")
-
- if not should_continue:
- print(f"✅ 对话在第{turn_count}轮结束")
- break
-
- # 获取对话摘要
- summary = simulator.get_conversation_summary()
- eval_results[index]['total_turns'] = summary['total_turns']
- eval_results[index]['final_state'] = summary['final_state']
- eval_results[index]['duration_minutes'] = summary['duration_minutes']
- print(f"\n📊 对话摘要:")
- print(f" - 总轮次: {summary['total_turns']}")
- print(f" - 最终状态: {summary['final_state']}")
- print(f" - 耗时: {summary['duration_minutes']:.1f} 分钟")
-
- with open('evaluation_results.json', 'w') as f:
- json.dump(eval_results, f, indent=4, ensure_ascii=False)
-
- # 简单的成功判断
- if summary['final_state'] == 'satisfied':
- print('✅ 测试通过: 对话成功完成')
+ logger.error(f"提取job_id失败: {e}")
+
+        # Poll job status: de-duplicate the IDs, then re-check every 10 s until every job reports status -1 or 2
+        if job_ids:
+            job_ids = list(set(job_ids))
+            while True:
+                await asyncio.sleep(10)  # non-blocking wait: time.sleep() here would stall the event loop and every task on it
+                all_finished = True
+                for job_id in job_ids:
+                    bohrium_client = Bohrium()
+                    job_info = bohrium_client.job.detail(job_id)
+                    logger.info(f"查询到job状态: {job_id} - 状态: {job_info['status']}")
+                    if job_info['status'] not in [-1, 2]:  # presumably -1/2 are terminal states - TODO confirm against the Bohrium API
+                        all_finished = False
+                if all_finished:
+                    break
+
+            user_response, should_continue = simulator.get_bohr_results(agent_response, job_ids)
else:
- print('❌ 测试失败: 对话未成功完成')
+ user_response, should_continue = simulator.generate_response(agent_response)
+
+ eval_results[f'user_response_{turn_count}'] = user_response
+ print(f"🧑 模拟用户: {user_response}")
- await runner.close()
+ if not should_continue:
+ print(f"✅ 对话在第{turn_count}轮结束")
+ break
+
+    # Conversation summary: copy the simulator's totals into the result record
+    summary = simulator.get_conversation_summary()
+    eval_results.update({
+        'total_turns': summary['total_turns'],
+        'final_state': summary['final_state'],
+        'duration_minutes': summary['duration_minutes'],
+    })
+
+    print(f"\n📊 对话摘要:")
+    print(f" - 总轮次: {summary['total_turns']}")
+    print(f" - 最终状态: {summary['final_state']}")
+    print(f" - 耗时: {summary['duration_minutes']:.1f} 分钟")
+
+    # Save the record. UTF-8 is forced because ensure_ascii=False writes non-ASCII text verbatim; the trailing newline keeps records appended in "a" mode apart (the file is then a stream of JSON objects, not one document).
+    with open('evaluation_results.json', save_mode, encoding='utf-8') as f:
+        f.write(json.dumps(eval_results, indent=4, ensure_ascii=False) + '\n')
+
+    if summary['final_state'] == 'satisfied':
+        print('✅ 测试通过: 对话成功完成')
+    else:
+        print('❌ 测试失败: 对话未成功完成')
+
+    await runner.close()
+    return eval_results
+
+
+async def evaluation_threads_task(file_path: str, max_turn_count: int = 10):
+    """Run every item of the dataset at file_path through _run_conversation, one after another, and return the list of result records."""
+    print('=' * 80)
+    print('🤖 与ADK Agent多轮对话测试')
+    print('=' * 80)
+
+    dataset_json = json.loads(load_dataset_json(file_path))
+    results = []
+    for i, dataset_item in enumerate(dataset_json):
+        await asyncio.sleep(10)  # throttle requests without blocking the event loop (time.sleep would stall it)
+        result = await _run_conversation(dataset_item, max_turn_count, save_mode='w' if i == 0 else 'a')
+        results.append(result)
     print('\n' + '=' * 80)
     print('🎉 多轮对话测试完成!')
     print('=' * 80)
+    return results
+
+
+async def evaluation_threads_single_task(file_path: str, item_id: int, max_turn_count: int = 10):
+    """Run the single dataset item at index item_id through _run_conversation (results file opened in append mode) and return its record."""
+    print('=' * 80)
+    print('🤖 与ADK Agent多轮对话测试')
+    print('=' * 80)
+
+    dataset_json = json.loads(load_dataset_json(file_path))
+    dataset_item = dataset_json[item_id]
+    await asyncio.sleep(10)  # throttle requests without blocking the event loop (time.sleep would stall it)
+
+    result = await _run_conversation(dataset_item, max_turn_count, save_mode='a')
+
+    print('\n' + '=' * 80)
+    print('🎉 单条多轮对话测试完成!')
+    print('=' * 80)
+    return result
diff --git a/evaluate/base/human_simulator.py b/evaluate/base/human_simulator.py
index 07fa3d1a..ccb58461 100644
--- a/evaluate/base/human_simulator.py
+++ b/evaluate/base/human_simulator.py
@@ -142,20 +142,22 @@ def _build_response_prompt(self, agent_message: str) -> str:
Agent最新回复:
{agent_message}
-请分析agent的回复是否满足任务需求,并生成合适的响应。
+请分析agent的回复是否满足 初始问题 需求。如果回复大致符合初始任务要求,请结束对话。如果不符合,请分析不符合的点在哪儿,并生成简洁的用户回复,继续引导agent完成任务。
重要限制:
-- 对话最多{self.max_turn_count}轮,当前是第{self.turn_count}轮
-- 除首轮对话外,其他轮次尽可能简短地回答agent的问题,回复内容紧扣初始问题,禁止发散
-- 如果agent在询问具体参数或设置,提供简洁明确的回答
-- 如果agent已经提供了初始任务所需的信息或完成了任务,请立刻结束对话
-- 禁止回复可能导致agent产生误解或偏离目标的内容
-
-请以JSON格式回复:
+- 对话最多{self.max_turn_count}轮,当前是第{self.turn_count}轮;
+- 尽可能简短地回答agent的问题,回复内容紧扣初始问题,禁止发散,避免执行轮数超出限制;
+- 如果agent在询问具体参数或设置,提供简洁明确的回答;
+- 如果agent明确指出当前任务无法完成,请礼貌地结束对话;
+- 如果agent已经提供了初始任务所需的信息或完成了任务,请立刻结束对话;
+- agent仅能以文本形式回复,禁止要求agent提供可视化结果;
+
+请以如下JSON格式回复:
{{
"response": "你的回复内容",
"continue": true/false // 是否继续对话
}}
+
"""
def get_bohr_results(self, agent_message: str, job_id: List[str]) -> Tuple[str, bool]:
diff --git a/evaluate/experiments/threads/database_search/database_search.json b/evaluate/experiments/threads/database_search/database_search.json
index 32fbe34a..d84d9840 100644
--- a/evaluate/experiments/threads/database_search/database_search.json
+++ b/evaluate/experiments/threads/database_search/database_search.json
@@ -1,47 +1,41 @@
[
{
"expected_outcomes": [
- "\u627e\u5230 COD \u4e2d\u5e26\u9699 >2 eV \u7684 direct \u6c27\u5316\u7269",
- "\u83b7\u5f97\u7ed3\u6784\u4fe1\u606f",
- "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6"
+ "\u627e\u5230 Materials Project \u4e2d\u5e26\u9699 >2 eV \u7684 \u6c27\u5316\u7269"
],
- "initial_question": "\u4ece COD \u4e2d\u641c\u7d22\u5e76\u8fd4\u56de\u4e09\u4e2a\u5e26\u9699\u5927\u4e8e 2 eV \u4e14\u4e3a direct \u7c7b\u578b\u7684\u6c27\u5316\u7269\u7ed3\u6784",
+ "initial_question": "\u5728 Materials Project \u4e2d\u68c0\u7d22\u5e76\u8fd4\u56de3\u4e2a\u5e26\u9699\u5927\u4e8e 2 eV \u7684\u6c27\u5316\u7269\u7ed3\u6784",
"success_criteria": [
"\u6240\u6709\u7ed3\u679c\u5747\u4e3a\u6c27\u5316\u7269",
- "\u5e26\u9699 >2 eV",
- "\u5e26\u9699\u7c7b\u578b\u4e3a direct",
- "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa",
+ "\u8fd4\u56de\u6587\u4ef6URL",
"\u7ed3\u679c\u6570\u91cf\u6b63\u786e"
]
},
{
"expected_outcomes": [
"\u5728 Alexandria \u4e2d\u68c0\u7d22",
- "\u627e\u5230\u6ee1\u8db3\u5143\u7d20\u548c\u5e26\u9699\u8303\u56f4\u7684\u6750\u6599",
+ "\u627e\u5230\u6ee1\u8db3\u5143\u7d20\u7684\u6750\u6599",
"\u83b7\u5f97\u5bf9\u5e94\u7684\u6750\u6599\u4fe1\u606f",
- "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6"
+ "\u6210\u529f\u8fd4\u56de\u5bf9\u5e94\u6587\u4ef6URL"
],
- "initial_question": "\u5728 Alexandria \u4e2d\u67e5\u627e\u5e76\u8fd4\u56de 5 \u4e2a\u542b Li\u3001Mn\u3001O \u4e14\u5e26\u9699\u4f4d\u4e8e 2\u20133 eV \u8303\u56f4\u7684\u6750\u6599",
+ "initial_question": "\u5728 Alexandria \u4e2d\u67e5\u627e\u5e76\u8fd4\u56de 5 \u4e2a\u542b Li\u3001Mn\u3001O \u7684\u6750\u6599",
"success_criteria": [
"\u6240\u6709\u7ed3\u679c\u5747\u6765\u81eaAlexandria",
"\u6240\u6709\u7ed3\u679c\u5747\u542b Li\u3001Mn\u3001O",
- "\u5e26\u9699\u5728 2\u20133 eV \u5185",
- "CIF \u6587\u4ef6\u5bfc\u51fa\u5b8c\u6574",
+ "\u6587\u4ef6 URL \u5b8c\u6574",
"\u7ed3\u679c\u6570\u91cf\u6b63\u786e"
]
},
{
"expected_outcomes": [
- "\u5728 Alexandria \u4e2d\u68c0\u7d22",
- "\u627e\u5230\u7b26\u5408\u5143\u7d20\u548c\u7a7a\u95f4\u7fa4\u6761\u4ef6\u7684\u5c0f\u5e26\u9699\u5316\u5408\u7269",
- "\u5bfc\u51fa CIF \u6587\u4ef6"
+ "\u5728 Materials Project \u4e2d\u68c0\u7d22",
+ "\u627e\u5230\u7b26\u5408\u5143\u7d20\u548c\u7a7a\u95f4\u7fa4\u6761\u4ef6\u7684\u5316\u5408\u7269",
+ "\u6210\u529f\u8fd4\u56de\u5bf9\u5e94\u6587\u4ef6URL"
],
- "initial_question": "\u5728 Alexandria \u4e2d\u68c0\u7d22\u5e76\u8fd4\u56de\u4e0d\u8d85\u8fc7 2 \u4e2a\u4ec5\u542b Ti\u3001Al\u3001O \u4e14\u7a7a\u95f4\u7fa4\u4e3a 123 \u5e26\u9699 \u22651.5 eV \u7684\u5316\u5408\u7269\uff0c\u5e76\u5bfc\u51fa CIF \u6587\u4ef6",
+ "initial_question": "\u5728 Materials Project \u4e2d\u68c0\u7d22\u5e76\u8fd4\u56de\u4e0d\u8d85\u8fc7 2 \u4e2a\u4ec5\u542b Ti\u3001Al\u3001O \u4e14\u7a7a\u95f4\u7fa4\u4e3a 63 \u7684\u5316\u5408\u7269\uff0c\u5e76\u5bfc\u51fa CIF \u6587\u4ef6",
"success_criteria": [
"\u7ed3\u679c\u53ea\u542b Ti\u3001Al\u3001O",
- "\u7a7a\u95f4\u7fa4\u5747\u4e3a 123",
- "\u5e26\u9699\u5747 \u22651.5 eV",
- "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa",
+ "\u7a7a\u95f4\u7fa4\u5747\u4e3a 63",
+ "\u6587\u4ef6 URL \u5b8c\u6574",
"\u7ed3\u679c\u6570\u91cf\u6b63\u786e"
]
},
@@ -49,46 +43,39 @@
"expected_outcomes": [
"\u5728 Alexandria \u4e2d\u68c0\u7d22",
"\u627e\u5230 Alexandria \u4e2d\u7684 Heusler \u5408\u91d1",
- "\u5bfc\u51fa\u78c1\u77e9\u6570\u636e",
- "\u5bf9\u78c1\u77e9\u6570\u636e\u6392\u5e8f",
- "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6"
+            "\u6210\u529f\u8fd4\u56de\u6587\u4ef6 URL"
],
- "initial_question": "\u5728 Alexandria \u4e2d\u968f\u673a\u68c0\u7d22 5 \u4e2a\u5df2\u77e5\u7684 Heusler \u5408\u91d1\uff0c\u5bfc\u51fa\u5176\u78c1\u77e9\u6570\u636e\u5e76\u4ece\u5c0f\u5230\u5927\u6392\u5e8f",
+ "initial_question": "\u5728 Alexandria \u4e2d\u968f\u673a\u68c0\u7d22 5 \u4e2a\u5df2\u77e5\u7684 Heusler \u5408\u91d1",
"success_criteria": [
"\u7ed3\u679c\u5747\u4e3a Heusler \u5408\u91d1",
- "\u78c1\u77e9\u6570\u636e\u5b8c\u6574\u5bfc\u51fa",
- "\u6392\u5e8f\u7ed3\u679c\u6b63\u786e",
- "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa",
+ "\u6587\u4ef6 URL \u5b8c\u6574",
"\u7ed3\u679c\u6570\u91cf\u6b63\u786e"
]
},
{
"expected_outcomes": [
"\u5728 COD \u4e2d\u68c0\u7d22",
- "\u627e\u5230\u7a7a\u95f4\u7fa4 225 \u7684 TiO\u2082 \u7ed3\u6784",
- "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6"
+ "\u627e\u5230\u7a7a\u95f4\u7fa4 61 \u7684 \u7ed3\u6784",
+ "\u6210\u529f\u8fd4\u56de\u5bf9\u5e94\u6587\u4ef6URL"
],
- "initial_question": "\u5728 COD \u4e2d\u67e5\u627e\u5e76\u8fd4\u56de\u4e0d\u8d85\u8fc7 5 \u4e2a\u7a7a\u95f4\u7fa4\u4e3a 225 \u7684 TiO\u2082 \u7ed3\u6784\uff0c\u5e76\u5bfc\u51fa CIF \u6587\u4ef6",
+ "initial_question": "\u5728 COD \u4e2d\u67e5\u627e\u5e76\u8fd4\u56de\u4e0d\u8d85\u8fc7 5 \u4e2a\u7a7a\u95f4\u7fa4\u4e3a 61 \u7684\u7ed3\u6784\uff0c\u5e76\u8fd4\u56de\u6587\u4ef6\u4e0b\u8f7d\u94fe\u63a5",
"success_criteria": [
"\u6240\u6709\u7ed3\u679c\u5747\u6765\u81eaCOD",
- "\u6240\u6709\u7ed3\u679c\u7a7a\u95f4\u7fa4\u4e3a 225",
- "\u5168\u90e8\u7ed3\u679c\u4e3a TiO\u2082",
- "CIF \u6587\u4ef6\u5bfc\u51fa\u5b8c\u6574",
+ "\u6240\u6709\u7ed3\u679c\u7a7a\u95f4\u7fa4\u4e3a 61",
+ "\u6587\u4ef6 URL \u5b8c\u6574",
"\u7ed3\u679c\u6570\u91cf\u6b63\u786e"
]
},
{
"expected_outcomes": [
"\u627e\u5230\u539f\u5b50\u6570 \u22644 \u7684 BCC \u91d1\u5c5e",
- "\u5bfc\u51fa CIF \u6587\u4ef6",
- "\u8ba1\u7b97\u5e73\u5747\u6676\u80de\u4f53\u79ef"
+ "\u6210\u529f\u8fd4\u56de\u5bf9\u5e94\u6587\u4ef6URL"
],
- "initial_question": "\u5728 COD \u4e2d\u68c0\u7d22\u539f\u5b50\u6570\u4e0d\u8d85\u8fc7 4 \u7684 BCC \u91d1\u5c5e\uff0c\u5e76\u5bfc\u51fa CIF \u6587\u4ef6\uff0c\u540c\u65f6\u7edf\u8ba1\u5e73\u5747\u6676\u80de\u4f53\u79ef",
+ "initial_question": "\u5728 COD \u4e2d\u68c0\u7d22\u539f\u5b50\u6570\u4e0d\u8d85\u8fc7 4 \u7684 BCC \u91d1\u5c5e\uff0c\u5e76\u8fd4\u56de\u6587\u4ef6\u4e0b\u8f7d\u94fe\u63a5",
"success_criteria": [
"\u6240\u6709\u7ed3\u679c\u4e3a BCC \u91d1\u5c5e",
"\u539f\u5b50\u6570 \u22644",
- "CIF \u6587\u4ef6\u5bfc\u51fa\u5b8c\u6574",
- "\u5e73\u5747\u6676\u80de\u4f53\u79ef\u8ba1\u7b97\u6b63\u786e",
+ "\u6587\u4ef6 URL \u5b8c\u6574",
"\u7ed3\u679c\u6570\u91cf\u6b63\u786e"
]
},
@@ -96,28 +83,14 @@
"expected_outcomes": [
"\u5728 COD \u4e2d\u68c0\u7d22",
"\u83b7\u5f97\u7b26\u5408\u6761\u4ef6\u7684\u4e09\u5143\u5316\u5408\u7269",
- "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6"
+ "\u6210\u529f\u8fd4\u56de\u6587\u4ef6 URL"
],
"initial_question": "\u5728 COD \u4e2d\u68c0\u7d22\u5e76\u8fd4\u56de\u4e0d\u8d85\u8fc7 5 \u4e2a\u542b\u7a00\u571f\u3001\u8fc7\u6e21\u91d1\u5c5e\u548c\u6c27\uff0c\u5e76\u4e14\u4e0d\u5305\u542b Fe \u548c Ni \u7684\u4e09\u5143\u5316\u5408\u7269",
"success_criteria": [
"\u6240\u6709\u7ed3\u679c\u5747\u6765\u81eaCOD",
"\u5168\u90e8\u7ed3\u679c\u542b\u7a00\u571f\u3001\u8fc7\u6e21\u91d1\u5c5e\u548c\u6c27",
"\u7ed3\u679c\u4e0d\u542b Fe \u548c Ni",
- "CIF \u6587\u4ef6\u5bfc\u51fa\u5b8c\u6574",
- "\u7ed3\u679c\u6570\u91cf\u6b63\u786e"
- ]
- },
- {
- "expected_outcomes": [
- "\u7b5b\u9009\u51fa\u7b26\u5408\u5f39\u6027\u6a21\u91cf\u548c\u6cca\u677e\u6bd4\u6761\u4ef6\u7684\u65e0\u673a\u6750\u6599",
- "\u83b7\u5f97\u5bf9\u5e94\u529b\u5b66\u6027\u8d28\u4fe1\u606f",
- "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6"
- ],
- "initial_question": "\u5728 Materials Project \u4e2d\u67e5\u627e\u5e76\u8fd4\u56de 4 \u4e2a\u5f39\u6027\u6a21\u91cf\u5927\u4e8e 300 GPa \u4e14\u6cca\u677e\u6bd4\u5c0f\u4e8e 0.25 \u7684\u65e0\u673a\u6750\u6599",
- "success_criteria": [
- "\u6240\u6709\u7ed3\u679c\u5f39\u6027\u6a21\u91cf >300 GPa",
- "\u6cca\u677e\u6bd4 <0.25",
- "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa",
+ "\u6587\u4ef6 URL \u5b8c\u6574",
"\u7ed3\u679c\u6570\u91cf\u6b63\u786e"
]
},
@@ -126,13 +99,13 @@
"\u5728Materials Project\u4e2d\u68c0\u7d22",
"\u68c0\u7d22\u5230\u542b Si \u548c O \u7684\u56db\u5143\u5316\u5408\u7269",
"\u6392\u9664 Fe \u548c Ni \u5143\u7d20",
- "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6"
+ "\u6210\u529f\u8fd4\u56de\u5bf9\u5e94\u7684\u6587\u4ef6 URL"
],
- "initial_question": "\u5728 Materials Project \u4e2d\u68c0\u7d22\u542b\u6709 Si \u548c O \u5207\u4e0d\u542b\u6709 Fe \u548c Ni \u7684\u56db\u5143\u5316\u5408\u7269\uff0c\u8fd4\u56de\u4e0d\u8d85\u8fc7 5 \u4e2a\u7ed3\u679c",
+ "initial_question": "\u5728 Materials Project \u4e2d\u68c0\u7d22\u542b\u6709 Si \u548c O \u4e14\u4e0d\u542b Fe \u548c Ni \u7684\u56db\u5143\u5316\u5408\u7269\uff0c\u8fd4\u56de\u4e0d\u8d85\u8fc7 5 \u4e2a\u7ed3\u679c",
"success_criteria": [
"\u6240\u6709\u7ed3\u679c\u5747\u7b26\u5408\u8981\u6c42",
"\u6240\u6709\u7ed3\u679c\u5747\u6765\u81eaMaterials Project",
- "CIF \u6587\u4ef6\u5bfc\u51fa\u5b8c\u6574",
+ "\u6587\u4ef6 URL \u5b8c\u6574",
"\u7ed3\u679c\u6570\u91cf\u6b63\u786e"
]
},
@@ -140,9 +113,9 @@
"expected_outcomes": [
"\u83b7\u53d6 Alexandria \u4e0e COD \u4e2d MoS\u2082 \u5c42\u72b6\u6750\u6599\u7684\u6570\u636e",
"\u5bf9\u6bd4\u5e26\u9699\u548c DOS",
- "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6"
+ "\u6210\u529f\u8fd4\u56de CIF \u6587\u4ef6 URL"
],
- "initial_question": "\u5bf9\u6bd4 Alexandria \u4e0e COD \u4e2d MoS\u2082 \u5c42\u72b6\u6750\u6599\u7684\u5e26\u9699\u548c DOS",
+ "initial_question": "\u5bf9\u6bd4 Alexandria \u4e0e COD \u4e2d MoS\u2082 \u5c42\u72b6\u6750\u6599\u7684\u5e26\u9699",
"success_criteria": [
"\u6240\u6709\u7ed3\u679c\u5747\u4e3a MoS\u2082 \u5c42\u72b6\u6750\u6599",
"\u5e26\u9699\u548c DOS \u5bf9\u6bd4\u5b8c\u6574",
@@ -151,70 +124,67 @@
},
{
"expected_outcomes": [
- "\u4ece\u4e24\u4e2a\u6570\u636e\u5e93\u83b7\u53d6 Fe\u2082O\u2083 \u7684\u5f62\u6210\u80fd\u548c\u5e26\u9699",
- "\u8fdb\u884c\u5bf9\u6bd4\u5206\u6790",
- "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6"
+ "\u4ece\u4e24\u4e2a\u4ee5\u4e0a\u7684\u6570\u636e\u5e93\u4e2d\u83b7\u53d6 Fe\u2082O\u2083 \u7684\u7ed3\u6784\u548c\u5e26\u9699",
+ "\u8fdb\u884c\u6570\u503c\u5bf9\u6bd4\u5206\u6790\uff0c\u4e0d\u9700\u8981\u7ed8\u56fe",
+ "\u6210\u529f\u8fd4\u56de\u6587\u4ef6 URL"
],
- "initial_question": "\u5bf9\u6bd4 Materials Project \u4e0e Alexandria \u4e2d Fe\u2082O\u2083 \u7684\u5f62\u6210\u80fd\u548c\u5e26\u9699\u6570\u636e",
+ "initial_question": "\u5bf9\u6bd4\u4e0d\u540c\u6570\u636e\u5e93\u4e2d Fe\u2082O\u2083 \u7684\u5e26\u9699\u6570\u636e",
"success_criteria": [
- "\u7ed3\u679c\u5305\u542b Fe\u2082O\u2083 \u5728\u4e24\u4e2a\u6570\u636e\u5e93\u4e2d\u7684\u6570\u636e",
- "\u5f62\u6210\u80fd\u548c\u5e26\u9699\u6570\u503c\u5bf9\u6bd4\u6e05\u6670",
- "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa"
+ "\u7ed3\u679c\u5305\u542b Fe\u2082O\u2083 \u5728\u591a\u4e2a\u6570\u636e\u5e93\u4e2d\u7684\u6570\u636e",
+ "\u5e26\u9699\u6570\u503c\u5bf9\u6bd4\u6e05\u6670",
+ "\u6587\u4ef6URL\u5b8c\u6574\u5bfc\u51fa"
]
},
{
"expected_outcomes": [
"\u4e0b\u8f7d\u542b\u9502\u6750\u6599\u7684 CIF \u6587\u4ef6",
"\u8ba1\u7b97\u5e73\u5747\u4f53\u79ef/\u539f\u5b50",
- "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6"
+ "\u6210\u529f\u8fd4\u56de CIF \u6587\u4ef6 URL"
],
"initial_question": "\u6279\u91cf\u4e0b\u8f7d Materials Project \u4e2d\u524d 5 \u4e2a\u542b\u9502\u6750\u6599\u7684 CIF \u6587\u4ef6\uff0c\u5e76\u7edf\u8ba1\u5176\u5e73\u5747\u4f53\u79ef/\u539f\u5b50",
"success_criteria": [
"\u4e0b\u8f7d\u7684 CIF \u6587\u4ef6\u6570\u91cf\u4e3a 5",
"\u5e73\u5747\u4f53\u79ef/\u539f\u5b50\u8ba1\u7b97\u6b63\u786e",
- "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa",
+ "\u6587\u4ef6URL\u5b8c\u6574\u5bfc\u51fa",
"\u7ed3\u679c\u6570\u91cf\u6b63\u786e"
]
},
{
"expected_outcomes": [
"\u627e\u5230\u5c42\u72b6\u8fc7\u6e21\u91d1\u5c5e\u786b\u5316\u7269",
- "\u7b5b\u9009\u5c42\u95f4\u7ed3\u5408\u80fd \u226450 meV/\u00c5\u00b2 \u7684\u5b50\u96c6",
- "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6"
+ "\u6210\u529f\u8fd4\u56de\u6587\u4ef6 URL"
],
- "initial_question": "\u68c0\u7d22\u5c42\u72b6\u8fc7\u6e21\u91d1\u5c5e\u786b\u5316\u7269\uff0c\u7b5b\u9009\u5e76\u8fd4\u56de 4 \u4e2a\u5c42\u95f4\u7ed3\u5408\u80fd\u4e0d\u8d85\u8fc7 50 meV/\u00c5\u00b2 \u7684\u7ed3\u679c",
+ "initial_question": "\u68c0\u7d22\u5e76\u8fd4\u56de 4 \u4e2a\u5c42\u72b6\u8fc7\u6e21\u91d1\u5c5e\u786b\u5316\u7269\u7684\u7ed3\u6784",
"success_criteria": [
"\u5168\u90e8\u7ed3\u679c\u4e3a\u5c42\u72b6\u8fc7\u6e21\u91d1\u5c5e\u786b\u5316\u7269",
- "\u5c42\u95f4\u7ed3\u5408\u80fd \u226450 meV/\u00c5\u00b2",
- "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa",
"\u7ed3\u679c\u6570\u91cf\u6b63\u786e"
]
},
{
"expected_outcomes": [
- "\u627e\u5230\u5e38\u89c1 MXene \u7ed3\u6784",
+ "\u627e\u5230 MXene \u7ed3\u6784",
"\u7b5b\u9009\u51fa\u8868\u9762\u7ec8\u6b62\u57fa\u56e2\u4e3a O \u6216 F \u7684\u7ed3\u6784",
- "\u5bfc\u51fa CIF \u6587\u4ef6"
+ "\u5bfc\u51fa\u6587\u4ef6"
],
- "initial_question": "\u68c0\u7d22\u5e38\u89c1\u7684 3 \u4e2a MXene \u7ed3\u6784\uff0c\u7b5b\u9009\u8868\u9762\u7ec8\u6b62\u57fa\u56e2\u4e3a O \u6216 F \u7684\u7ed3\u679c\uff0c\u5e76\u5bfc\u51fa CIF \u6587\u4ef6",
+ "initial_question": "\u68c0\u7d22 3 \u4e2a\u542b\u6709 O \u6216 F \u7684 MXene \u7ed3\u6784\uff0c\u5e76\u8fd4\u56de\u6587\u4ef6URL",
"success_criteria": [
"\u5168\u90e8\u7ed3\u679c\u4e3a MXene",
"\u7b5b\u9009\u51fa\u7684\u7ed3\u679c\u8868\u9762\u7ec8\u6b62\u57fa\u56e2\u4e3a O \u6216 F",
- "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa",
+ "URL\u5b8c\u6574\u5bfc\u51fa",
"\u7ed3\u679c\u6570\u91cf\u6b63\u786e"
]
},
{
"expected_outcomes": [
- "\u627e\u5230\u4e8c\u7ef4\u5c42\u72b6\u6750\u6599",
+ "\u627e\u5230\u5c42\u72b6\u6750\u6599",
"\u7b5b\u9009\u5e26\u9699 <1 eV \u7684\u5b50\u96c6",
- "\u6210\u529f\u5bfc\u51fa CIF \u6587\u4ef6"
+ "\u6210\u529f\u8fd4\u56deURL"
],
- "initial_question": "\u68c0\u7d22\u5e76\u7b5b\u9009\u7981\u5e26\u5bbd\u5ea6\u5c0f\u4e8e 1 eV \u7684\u4e8c\u7ef4\u5c42\u72b6\u6750\u6599\uff0c\u8fd4\u56de 5 \u4e2a\u7ed3\u679c",
+ "initial_question": "\u68c0\u7d22\u7981\u5e26\u5bbd\u5ea6\u5c0f\u4e8e 1 eV \u7684\u5c42\u72b6\u6750\u6599\uff0c\u8fd4\u56de 5 \u4e2a\u7ed3\u679c",
"success_criteria": [
- "\u7ed3\u679c\u5747\u4e3a\u4e8c\u7ef4\u5c42\u72b6\u6750\u6599",
+ "\u7ed3\u679c\u5747\u4e3a\u5c42\u72b6\u6750\u6599",
"\u5e26\u9699 <1 eV",
- "CIF \u6587\u4ef6\u5b8c\u6574\u5bfc\u51fa",
+ "\u6587\u4ef6URL\u5b8c\u6574\u5bfc\u51fa",
"\u7ed3\u679c\u6570\u91cf\u6b63\u786e"
]
}
diff --git a/evaluate/experiments/threads/database_search/database_search_bash.py b/evaluate/experiments/threads/database_search/database_search_bash.py
new file mode 100644
index 00000000..70c4f715
--- /dev/null
+++ b/evaluate/experiments/threads/database_search/database_search_bash.py
@@ -0,0 +1,14 @@
+import asyncio
+import argparse
+from evaluate.base.evaluation import evaluation_threads_single_task
+
+if __name__ == '__main__':
+    # Print a start-up banner, then parse the CLI options and evaluate the selected dataset item
+    print('🚀 人类模拟器启动')
+    print('=' * 50)
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--max_turn_count', type=int, default=10, help='最大对话轮数')
+    parser.add_argument('--item_id', type=int, default=0, help='样本索引')
+    args = parser.parse_args()
+
+    asyncio.run(evaluation_threads_single_task('database_search.json', item_id=args.item_id, max_turn_count=args.max_turn_count))
diff --git a/evaluate/experiments/threads/database_search/run.sh b/evaluate/experiments/threads/database_search/run.sh
new file mode 100755
index 00000000..a6572de6
--- /dev/null
+++ b/evaluate/experiments/threads/database_search/run.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+PYTHON=.venv/bin/python # your .venv
+set -a  # export every variable defined while sourcing .env
+source .env # your .env
+set +a
+
+export PYTHONPATH=/your/matmaster/path/MatMaster:$PYTHONPATH
+export MAX_JOBS=3  # maximum number of dataset items evaluated concurrently
+
+TOTAL=$("$PYTHON" -c "
+import json
+with open('database_search.json', encoding='utf-8') as f:
+    print(len(json.load(f)))
+")
+# Stop here if the item count could not be read; an empty TOTAL would break the loop header below.
+[[ "$TOTAL" =~ ^[0-9]+$ ]] || { echo "Could not determine the number of items in database_search.json" >&2; exit 1; }
+
+echo '总数据量:' "$TOTAL"
+
+running_jobs=0
+
+for ((i=0; i<TOTAL; i++)); do
+    echo "🚀 提交任务: item $i"
+    sleep 3
+    "$PYTHON" database_search_bash.py \
+        --item_id "$i" > "item_$i.log" 2>&1 &
+
+    ((running_jobs++))
+
+    # Once the number of running jobs reaches the limit, wait for any one of them to finish
+    if (( running_jobs >= MAX_JOBS )); then
+        wait -n
+        ((running_jobs--))
+    fi
+done
+
+# Wait for the last batch of jobs
+wait
+echo "✅ 所有任务完成"
diff --git a/evaluate/experiments/threads/structure_generate/run.sh b/evaluate/experiments/threads/structure_generate/run.sh
new file mode 100755
index 00000000..7f5e39bb
--- /dev/null
+++ b/evaluate/experiments/threads/structure_generate/run.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+PYTHON=.venv/bin/python # your .venv
+set -a  # export every variable defined while sourcing .env
+source .env # your .env
+set +a
+
+export PYTHONPATH=/your/matmaster/path/MatMaster:$PYTHONPATH
+export MAX_JOBS=3  # maximum number of dataset items evaluated concurrently
+
+TOTAL=$("$PYTHON" -c "
+import json
+with open('structure_generate.json', encoding='utf-8') as f:
+    print(len(json.load(f)))
+")
+# Stop here if the item count could not be read; an empty TOTAL would break the loop header below.
+[[ "$TOTAL" =~ ^[0-9]+$ ]] || { echo "Could not determine the number of items in structure_generate.json" >&2; exit 1; }
+
+echo '总数据量:' "$TOTAL"
+
+running_jobs=0
+
+for ((i=0; i<TOTAL; i++)); do
+    echo "🚀 提交任务: item $i"
+    sleep 3
+    "$PYTHON" structure_generate_bash.py \
+        --item_id "$i" > "item_$i.log" 2>&1 &
+
+    ((running_jobs++))
+
+    # Once the number of running jobs reaches the limit, wait for any one of them to finish
+    if (( running_jobs >= MAX_JOBS )); then
+        wait -n
+        ((running_jobs--))
+    fi
+done
+
+# Wait for the last batch of jobs
+wait
+echo "✅ 所有任务完成"
diff --git a/evaluate/experiments/threads/structure_generate/structure_generate.json b/evaluate/experiments/threads/structure_generate/structure_generate.json
index a4469b34..8f014835 100644
--- a/evaluate/experiments/threads/structure_generate/structure_generate.json
+++ b/evaluate/experiments/threads/structure_generate/structure_generate.json
@@ -88,19 +88,6 @@
"CIF \u6587\u4ef6\u5bfc\u51fa\u5b8c\u6574"
]
},
- {
- "expected_outcomes": [
- "\u63a2\u7d22 Al\u2013Cu\u2013Fe \u4f53\u7cfb\u4e2d\u53ef\u80fd\u7684\u51c6\u6676\u6216\u51c6\u5468\u671f\u6784\u578b\u5019\u9009",
- "\u5bf9\u5019\u9009\u7ed3\u6784\u8fdb\u884c\u5bf9\u79f0\u6027\u5206\u6790\uff08\u5982\u65cb\u8f6c\u5bf9\u79f0\u3001\u957f\u7a0b\u6709\u5e8f/\u65e0\u5468\u671f\u6027\u7279\u5f81\uff09",
- "\u63d0\u4f9b\u53ef\u89c6\u5316/\u63cf\u8ff0\u6027\u5206\u6790\u7ed3\u679c\u548c\u7ed3\u6784\u6587\u4ef6"
- ],
- "initial_question": "\u9884\u6d4b Al\u2013Cu\u2013Fe \u4e09\u5143\u4f53\u7cfb\u53ef\u80fd\u5f62\u6210\u7684\u51c6\u6676\u7ed3\u6784\uff0c\u5e76\u5206\u6790\u5176\u5bf9\u79f0\u6027\u7279\u5f81",
- "success_criteria": [
- "\u751f\u6210\u6216\u8bc6\u522b\u82e5\u5e72\u53ef\u80fd\u7684\u51c6\u6676\u5019\u9009\u7ed3\u6784\uff08\u6216\u7ed9\u51fa\u4e0d\u5b58\u5728\u7684\u7ed3\u8bba\uff09",
- "\u5b8c\u6210\u5bf9\u79f0\u6027\u7279\u5f81\u7684\u5b9a\u6027\u4e0e\u91cf\u5316\u5206\u6790\uff08\u4f8b\u5982\u5b58\u5728\u7684\u65cb\u8f6c\u5bf9\u79f0\u9636\u6570\u3001\u7f3a\u4e4f\u5e73\u79fb\u5468\u671f\u6027\u6216\u8fd1\u4f3c\u8d85\u6676\u683c\u63cf\u8ff0\uff09",
- "\u63d0\u4f9b\u53ef\u5bfc\u51fa\u7684\u7ed3\u6784\u8868\u793a\u6216\u793a\u610f\u6587\u4ef6"
- ]
- },
{
"expected_outcomes": [
"\u751f\u6210 Li\u2013S \u591a\u786b\u5316\u7269\u7684\u5019\u9009\u7ed3\u6784\u96c6\u5408",
diff --git a/evaluate/experiments/threads/structure_generate/structure_generate_bash.py b/evaluate/experiments/threads/structure_generate/structure_generate_bash.py
new file mode 100644
index 00000000..a09fc0b1
--- /dev/null
+++ b/evaluate/experiments/threads/structure_generate/structure_generate_bash.py
@@ -0,0 +1,14 @@
+import asyncio
+import argparse
+from evaluate.base.evaluation import evaluation_threads_single_task
+
+if __name__ == '__main__':
+    # Print a start-up banner, then parse the CLI options and evaluate the selected dataset item
+    print('🚀 人类模拟器启动')
+    print('=' * 50)
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--max_turn_count', type=int, default=20, help='最大对话轮数')
+    parser.add_argument('--item_id', type=int, default=0, help='样本索引')
+    args = parser.parse_args()
+
+    asyncio.run(evaluation_threads_single_task('structure_generate.json', item_id=args.item_id, max_turn_count=args.max_turn_count))