@@ -115,175 +115,201 @@ def multi_turn_evaluation_task(dataset_item):
115115 return result
116116
117117
118- async def evaluation_threads_task ( file_path , max_turn_count = 10 ):
119- """与ADK agent进行多轮对话测试"""
120- print ( '=' * 80 )
121- print ( '🤖 与ADK Agent多轮对话测试' )
122- print ( '=' * 80 )
118+ import asyncio
119+ import json
120+ import re
121+ import time
122+ from typing import Dict , Any , List
123123
124- dataset_json = json .loads (load_dataset_json (file_path ))
125- eval_results = []
126- for index , dataset_item in enumerate (dataset_json ):
127- time .sleep (10 ) # 避免请求过于频繁
128- session_service = InMemorySessionService ()
129- session = await session_service .create_session (
130- app_name = 'matmaster_agent' ,
131- user_id = 'human_simulator_test' ,
124+
125+ async def _run_conversation (dataset_item : Dict [str , Any ], max_turn_count : int , save_mode : str = 'w' ) -> Dict [str , Any ]:
126+ """
127+ 执行一次对话测试,并返回结果
128+ :param dataset_item: 单条测试数据
129+ :param max_turn_count: 最大对话轮次
130+ :param save_mode: 写文件模式 ("w" 覆盖 / "a" 追加)
131+ """
132+ session_service = InMemorySessionService ()
133+ session = await session_service .create_session (
134+ app_name = 'matmaster_agent' ,
135+ user_id = 'human_simulator_test' ,
136+ )
137+
138+ logger .info (f"Test Session: { session .id } " )
139+
140+ runner = Runner (
141+ app_name = 'matmaster_agent' ,
142+ agent = root_agent ,
143+ session_service = session_service
144+ )
145+
146+ simulator = HumanSimulator (max_turn_count = max_turn_count )
147+
148+ # 场景初始化
149+ scenario = {
150+ 'name' : dataset_item ['initial_question' ],
151+ 'goal' : ConversationGoal (
152+ initial_question = dataset_item ['initial_question' ],
153+ expected_outcomes = dataset_item ['expected_outcomes' ],
154+ success_criteria = dataset_item ['success_criteria' ]
132155 )
156+ }
157+ print (f"\n { '=' * 20 } 测试场景: { scenario ['name' ]} { '=' * 20 } " )
133158
134- logger .info (f"Test Session: { session .id } " )
159+ simulator .set_goal (scenario ['goal' ])
160+ initial_question = simulator .get_initial_question ()
135161
136- runner = Runner (
137- app_name = 'matmaster_agent' ,
138- agent = root_agent ,
139- session_service = session_service
140- )
162+ print (f"🎯 对话目标: { initial_question } " )
163+ print (f"📋 期望结果: { ', ' .join (scenario ['goal' ].expected_outcomes )} " )
164+ print (f"✅ 成功标准: { ', ' .join (scenario ['goal' ].success_criteria )} " )
165+
166+ # 初始化结果
167+ eval_results = {
168+ 'initial_question' : initial_question ,
169+ 'expected_outcomes' : scenario ['goal' ].expected_outcomes ,
170+ 'success_criteria' : scenario ['goal' ].success_criteria ,
171+ }
172+ for i in range (1 , max_turn_count + 1 ):
173+ eval_results [f'agent_response_{ i } ' ] = ''
174+ eval_results [f'user_response_{ i } ' ] = ''
175+
176+ # 对话循环
177+ turn_count = 0
178+ while turn_count < max_turn_count :
179+ turn_count += 1
180+ print (f"\n 🔄 第 { turn_count } 轮对话:" )
181+
182+ # 获取用户输入
183+ user_input = initial_question if turn_count == 1 else simulator .get_last_user_response ()
184+ print (f"🧑 模拟用户: { user_input } " )
185+
186+ # 调用 agent
187+ try :
188+ content = types .Content (role = 'user' , parts = [types .Part (text = user_input )])
189+ agent_response = ''
190+
191+ events = runner .run_async (
192+ user_id = session .user_id ,
193+ session_id = session .id ,
194+ new_message = content ,
195+ run_config = RunConfig (streaming_mode = StreamingMode .SSE )
196+ )
197+
198+ async for event in events :
199+ if event .content and event .content .parts :
200+ for part in event .content .parts :
201+ if part .text :
202+ agent_response += part .text
203+ except asyncio .CancelledError :
204+ msg = '任务被取消,可能是超时或作用域取消导致'
205+ logger .error (msg )
206+ eval_results [f'agent_response_{ turn_count } ' ] = msg
207+ break
208+ except Exception as e :
209+ logger .error (f"获取agent响应失败: { e } " )
210+ eval_results [f'agent_response_{ turn_count } ' ] = str (e )
211+ break
141212
142- # 创建人类模拟器
143- simulator = HumanSimulator (max_turn_count = max_turn_count )
144-
145- # 数据预处理
146- scenario = {
147- 'name' : dataset_item ['initial_question' ],
148- 'goal' : ConversationGoal (
149- initial_question = dataset_item ['initial_question' ],
150- expected_outcomes = dataset_item ['expected_outcomes' ],
151- success_criteria = dataset_item ['success_criteria' ]
152- )}
153-
154- print (f"\n { '=' * 20 } 测试场景: { scenario ['name' ]} { '=' * 20 } " )
155-
156- # 设置对话目标
157- simulator .set_goal (scenario ['goal' ])
158- initial_question = simulator .get_initial_question ()
159-
160- print (f"🎯 对话目标: { initial_question } " )
161- print (f"📋 期望结果: { ', ' .join (scenario ['goal' ].expected_outcomes )} " )
162- print (f"✅ 成功标准: { ', ' .join (scenario ['goal' ].success_criteria )} " )
163-
164- # 初始化记录
165- eval_results .append ({})
166- eval_results [index ]['initial_question' ] = initial_question
167- eval_results [index ]['expected_outcomes' ] = scenario ['goal' ].expected_outcomes
168- eval_results [index ]['success_criteria' ] = scenario ['goal' ].success_criteria
169- for i in range (1 , 6 ):
170- eval_results [index ][f'agent_response_{ i } ' ] = ''
171- eval_results [index ][f'user_response_{ i } ' ] = ''
172-
173- # 开始对话
174- conversation_ended = False
175- turn_count = 0
176-
177- while not conversation_ended and turn_count < 10 :
178- turn_count += 1
179- print (f"\n 🔄 第 { turn_count } 轮对话:" )
180-
181- # 获取用户输入(从模拟器)
182- if turn_count == 1 :
183- user_input = initial_question
184- else :
185- # 从模拟器获取响应
186- user_input = simulator .get_last_user_response ()
187-
188- print (f"🧑 模拟用户: { user_input } " )
189-
190- # 调用ADK agent
213+ eval_results [f'agent_response_{ turn_count } ' ] = agent_response
214+ print (f"🤖 ADK Agent: { agent_response } " )
215+
216+ # 提取 job_id
217+ job_jsons = re .findall (r'<bohrium-chat-msg>(.*?)</bohrium-chat-msg>' , agent_response )
218+ job_ids : List [str ] = []
219+ for job_json in job_jsons :
191220 try :
192- content = types .Content (role = 'user' , parts = [types .Part (text = user_input )])
193-
194- agent_response = ''
195-
196- events = runner .run_async (
197- user_id = session .user_id ,
198- session_id = session .id ,
199- new_message = content ,
200- run_config = RunConfig (streaming_mode = StreamingMode .SSE )
201- )
202-
203- # 收集agent响应
204- async for event in events :
205- if event .content and event .content .parts :
206- for part in event .content .parts :
207- if part .text :
208- agent_response += part .text
209- except asyncio .CancelledError :
210- logger .error ('任务被取消,可能是超时或作用域取消导致' )
211- print (f"✅ 对话在第{ turn_count } 轮结束" )
212- eval_results [index ][f'agent_response_{ turn_count } ' ] = '任务被取消,可能是超时或作用域取消导致'
213- break
221+ job_json = json .loads (job_json )
222+ if 'eventData' in job_json and 'content' in job_json ['eventData' ]:
223+ content = job_json ['eventData' ]['content' ]
224+ if 'job_list' in content and 'job_id' in content ['job_list' ]:
225+ job_ids .append (content ['job_list' ]['job_id' ])
214226 except Exception as e :
215- logger .error (f"获取agent响应失败: { e } " )
216- print (f"✅ 对话在第{ turn_count } 轮结束" )
217- eval_results [index ][f'agent_response_{ turn_count } ' ] = str (e )
218- break
219-
220- eval_results [index ][f'agent_response_{ turn_count } ' ] = agent_response
221- print (f"🤖 ADK Agent: { agent_response } " )
222-
223- job_jsons = re .findall (r'<bohrium-chat-msg>(.*?)</bohrium-chat-msg>' , agent_response )
224- job_ids = []
225- if job_jsons :
226- for job_json in job_jsons :
227- try :
228- job_json = json .loads (job_json )
229- if 'eventData' in job_json and 'content' in job_json ['eventData' ]:
230- content = job_json ['eventData' ]['content' ]
231- if 'job_list' in content and 'job_id' in content ['job_list' ]:
232- job_id = content ['job_list' ]['job_id' ]
233- job_ids .append (job_id )
234- except Exception as e :
235- logger .error (f"提取job_id失败: { e } " )
236-
237- # 查询job状态
238- if job_ids :
239- job_ids = list (set (job_ids ))
240- while True :
241- time .sleep (10 )
242- all_finished = True
243- for job_id in job_ids :
244- bohrium_client = Bohrium ()
245- job_info = bohrium_client .job .detail (job_id )
246- logger .info (f"查询到job状态: { job_id } - 状态: { job_info ["status" ]} " )
247- if job_info ['status' ] not in [- 1 , 2 ]:
248- all_finished = False
249- if all_finished :
250- break
251-
252- # 使用模拟器生成用户响应
253- user_response , should_continue = simulator .get_bohr_results (agent_response , job_ids )
254- eval_results [index ][f'user_response_{ turn_count } ' ] = user_response
255- print (f"🧑 模拟用户: { user_response } " )
256- else :
257- # 使用模拟器生成用户响应
258- user_response , should_continue = simulator .generate_response (agent_response )
259- eval_results [index ][f'user_response_{ turn_count } ' ] = user_response
260- print (f"🧑 模拟用户: { user_response } " )
261-
262- if not should_continue :
263- print (f"✅ 对话在第{ turn_count } 轮结束" )
264- break
265-
266- # 获取对话摘要
267- summary = simulator .get_conversation_summary ()
268- eval_results [index ]['total_turns' ] = summary ['total_turns' ]
269- eval_results [index ]['final_state' ] = summary ['final_state' ]
270- eval_results [index ]['duration_minutes' ] = summary ['duration_minutes' ]
271- print (f"\n 📊 对话摘要:" )
272- print (f" - 总轮次: { summary ['total_turns' ]} " )
273- print (f" - 最终状态: { summary ['final_state' ]} " )
274- print (f" - 耗时: { summary ['duration_minutes' ]:.1f} 分钟" )
275-
276- with open ('evaluation_results.json' , 'w' ) as f :
277- json .dump (eval_results , f , indent = 4 , ensure_ascii = False )
278-
279- # 简单的成功判断
280- if summary ['final_state' ] == 'satisfied' :
281- print ('✅ 测试通过: 对话成功完成' )
227+ logger .error (f"提取job_id失败: { e } " )
228+
229+ # 查询 job 状态
230+ if job_ids :
231+ job_ids = list (set (job_ids ))
232+ while True :
233+ time .sleep (10 )
234+ all_finished = True
235+ for job_id in job_ids :
236+ bohrium_client = Bohrium ()
237+ job_info = bohrium_client .job .detail (job_id )
238+ logger .info (f"查询到job状态: { job_id } - 状态: { job_info ['status' ]} " )
239+ if job_info ['status' ] not in [- 1 , 2 ]:
240+ all_finished = False
241+ if all_finished :
242+ break
243+
244+ user_response , should_continue = simulator .get_bohr_results (agent_response , job_ids )
282245 else :
283- print ('❌ 测试失败: 对话未成功完成' )
246+ user_response , should_continue = simulator .generate_response (agent_response )
247+
248+ eval_results [f'user_response_{ turn_count } ' ] = user_response
249+ print (f"🧑 模拟用户: { user_response } " )
284250
285- await runner .close ()
251+ if not should_continue :
252+ print (f"✅ 对话在第{ turn_count } 轮结束" )
253+ break
254+
255+ # 对话总结
256+ summary = simulator .get_conversation_summary ()
257+ eval_results .update ({
258+ 'total_turns' : summary ['total_turns' ],
259+ 'final_state' : summary ['final_state' ],
260+ 'duration_minutes' : summary ['duration_minutes' ],
261+ })
262+
263+ print (f"\n 📊 对话摘要:" )
264+ print (f" - 总轮次: { summary ['total_turns' ]} " )
265+ print (f" - 最终状态: { summary ['final_state' ]} " )
266+ print (f" - 耗时: { summary ['duration_minutes' ]:.1f} 分钟" )
267+
268+ # 保存结果
269+ with open ('evaluation_results.json' , save_mode ) as f :
270+ json .dump (eval_results , f , indent = 4 , ensure_ascii = False )
271+
272+ if summary ['final_state' ] == 'satisfied' :
273+ print ('✅ 测试通过: 对话成功完成' )
274+ else :
275+ print ('❌ 测试失败: 对话未成功完成' )
276+
277+ await runner .close ()
278+ return eval_results
279+
280+
281+ async def evaluation_threads_task (file_path : str , max_turn_count : int = 10 ):
282+ """批量测试所有数据"""
283+ print ('=' * 80 )
284+ print ('🤖 与ADK Agent多轮对话测试' )
285+ print ('=' * 80 )
286+
287+ dataset_json = json .loads (load_dataset_json (file_path ))
288+ results = []
289+ for i , dataset_item in enumerate (dataset_json ):
290+ time .sleep (10 ) # 避免请求过于频繁
291+ result = await _run_conversation (dataset_item , max_turn_count , save_mode = 'w' if i == 0 else 'a' )
292+ results .append (result )
286293
287294 print ('\n ' + '=' * 80 )
288295 print ('🎉 多轮对话测试完成!' )
289296 print ('=' * 80 )
297+ return results
298+
299+
300+ async def evaluation_threads_single_task (file_path : str , item_id : int , max_turn_count : int = 10 ):
301+ """测试单个数据"""
302+ print ('=' * 80 )
303+ print ('🤖 与ADK Agent多轮对话测试' )
304+ print ('=' * 80 )
305+
306+ dataset_json = json .loads (load_dataset_json (file_path ))
307+ dataset_item = dataset_json [item_id ]
308+ time .sleep (10 ) # 避免请求过于频繁
309+
310+ result = await _run_conversation (dataset_item , max_turn_count , save_mode = 'a' )
311+
312+ print ('\n ' + '=' * 80 )
313+ print ('🎉 单条多轮对话测试完成!' )
314+ print ('=' * 80 )
315+ return result
0 commit comments