|
4 | 4 | import re |
5 | 5 | import time |
6 | 6 | import uuid |
7 | | -from dataclasses import dataclass |
8 | | -from enum import Enum |
9 | | -from typing import List, Dict, Any, Optional, Tuple |
10 | 7 |
|
11 | 8 | from bohrium import Bohrium |
12 | 9 | from dotenv import load_dotenv, find_dotenv |
|
15 | 12 | from google.adk.agents.run_config import StreamingMode |
16 | 13 | from google.adk.sessions import InMemorySessionService |
17 | 14 | from google.genai import types |
18 | | -from litellm import completion |
19 | 15 |
|
20 | 16 | from agents.matmaster_agent.agent import root_agent |
21 | 17 | from agents.matmaster_agent.constant import MATMASTER_AGENT_NAME |
22 | 18 | from agents.matmaster_agent.utils.event_utils import is_function_call |
| 19 | +from evaluate.base.human_simulator import ConversationGoal, HumanSimulator |
23 | 20 | from evaluate.utils import load_dataset_json |
24 | 21 |
|
25 | 22 | logger = logging.getLogger(__name__) |
@@ -118,213 +115,7 @@ def multi_turn_evaluation_task(dataset_item): |
118 | 115 | return result |
119 | 116 |
|
120 | 117 |
|
class ConversationState(Enum):
    """Lifecycle states of a simulated conversation."""

    # State assigned when a simulator is created or given a new goal.
    INITIAL = 'initial'
    # Conversation is underway.
    IN_PROGRESS = 'in_progress'
    # The simulated user decided the conversation can stop.
    SATISFIED = 'satisfied'
    # The turn limit was reached before the user was satisfied.
    TIMEOUT = 'timeout'
127 | | - |
128 | | - |
@dataclass
class ConversationGoal:
    """What the simulated user is trying to get out of a conversation."""

    # Opening question the simulated user sends to the agent.
    initial_question: str
    # Outcomes the user hopes to obtain from the agent.
    expected_outcomes: List[str]
    # Criteria describing when the conversation counts as successful.
    success_criteria: List[str]
135 | | - |
136 | | - |
class HumanSimulator:
    """
    Simplified human simulator for multi-turn agent evaluation.

    Responsibilities:
    1. Play the role of a user talking to the agent.
    2. Hold the conversation goal.
    3. Produce context-aware replies by prompting an LLM.
    4. Cap the conversation at ``max_turns`` turns (10 by default).
    """

    def __init__(self, model: str = 'deepseek/deepseek-chat', max_turns: int = 10):
        """
        Args:
            model: Model identifier passed to ``completion``.
            max_turns: Turn at which the simulator ends the conversation.
        """
        self.model = model
        self.max_turns = max_turns
        self.conversation_history: List[Dict[str, Any]] = []
        self.current_state = ConversationState.INITIAL
        self.turn_count = 0
        self.start_time: Optional[float] = None
        self.goal: Optional['ConversationGoal'] = None

    def set_goal(self, goal: 'ConversationGoal'):
        """Install a new goal and reset all per-conversation state."""
        self.goal = goal
        self.current_state = ConversationState.INITIAL
        self.turn_count = 0
        # Start from a fresh list so a reused simulator does not carry the
        # previous goal's transcript (and earlier summaries keep theirs).
        self.conversation_history = []
        self.start_time = time.time()
        logger.info(f"设置对话目标: {goal.initial_question}")

    def get_initial_question(self) -> str:
        """
        Return the opening question of the current goal.

        Raises:
            ValueError: If no goal has been set.
        """
        if not self.goal:
            raise ValueError('未设置对话目标')
        return self.goal.initial_question

    def _record(self, role: str, text: str) -> None:
        """Append one history entry for ``role`` ('agent' or 'user')."""
        self.conversation_history.append({
            'turn': self.turn_count,
            role: text,
            'timestamp': time.time(),
        })

    def generate_response(self, agent_message: str) -> Tuple[str, bool]:
        """
        Produce the simulated user's reply to the agent's message.

        Args:
            agent_message: The agent's latest reply.

        Returns:
            Tuple[str, bool]: (user reply, whether to keep talking).

        Raises:
            ValueError: If no goal has been set.
        """
        if not self.goal:
            raise ValueError('未设置对话目标')

        self.turn_count += 1
        self._record('agent', agent_message)

        # Stop once the turn cap is reached, without calling the LLM.
        if self.turn_count >= self.max_turns:
            self.current_state = ConversationState.TIMEOUT
            farewell = f'我们已经聊了{self.max_turns}轮了,我想结束这个对话。'
            # Record the farewell too, so the transcript matches what the
            # caller actually receives.
            self._record('user', farewell)
            return farewell, False

        user_response, should_continue = self._generate_user_response(agent_message)

        if not should_continue:
            self.current_state = ConversationState.SATISFIED

        self._record('user', user_response)
        return user_response, should_continue

    @staticmethod
    def _parse_llm_reply(content: str) -> Tuple[str, bool]:
        """
        Extract ``(response, continue)`` from the raw LLM text.

        Accepts plain JSON, JSON inside a Markdown code fence, and JSON
        surrounded by extra prose. Missing keys fall back to the same
        defaults as before ('我理解了。' and True).

        Raises:
            ValueError: If no JSON object can be decoded from ``content``
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        text = (content or '').strip()

        # Models frequently wrap JSON in ```json ... ``` fences.
        fenced = re.search(r'```(?:json)?\s*(.*?)```', text, re.DOTALL | re.IGNORECASE)
        if fenced:
            text = fenced.group(1).strip()

        try:
            payload = json.loads(text)
        except json.JSONDecodeError:
            # Fall back to the outermost {...} span when prose surrounds it.
            start, end = text.find('{'), text.rfind('}')
            if start == -1 or end <= start:
                raise
            payload = json.loads(text[start:end + 1])

        if not isinstance(payload, dict):
            raise ValueError('LLM reply is not a JSON object')

        reply = payload.get('response', '我理解了。')
        flag = payload.get('continue', True)
        # A string such as "false" is truthy; interpret it explicitly.
        if isinstance(flag, str):
            flag = flag.strip().lower() not in {'false', 'no', '0', ''}
        return reply, bool(flag)

    def _generate_user_response(self, agent_message: str) -> Tuple[str, bool]:
        """
        Ask the LLM for the next user reply.

        Best-effort by design: any failure (request or parsing) is logged
        and replaced by a neutral reply that keeps the conversation going.
        """
        prompt = self._build_response_prompt(agent_message)

        try:
            response = completion(
                model=self.model,
                messages=[{'role': 'user', 'content': prompt}],
                temperature=0.7
            )
            user_response, should_continue = self._parse_llm_reply(
                response.choices[0].message.content
            )
        except Exception as e:
            logger.error(f"生成用户响应失败: {e}")
            return '我理解了,请继续。', True

        logger.info(f"用户响应生成 - 轮次: {self.turn_count}, 继续: {should_continue}")
        return user_response, should_continue

    def _build_response_prompt(self, agent_message: str) -> str:
        """Build the prompt that asks the LLM to answer as the simulated user."""

        return f"""
你是一个模拟用户,正在与一个材料计算AI agent进行多轮对话。请基于以下信息生成合适的响应:

对话目标:
- 初始问题: {self.goal.initial_question}
- 期望结果: {', '.join(self.goal.expected_outcomes)}
- 成功标准: {', '.join(self.goal.success_criteria)}

当前状态:
- 对话轮次: {self.turn_count}/{self.max_turns}

Agent最新回复:
{agent_message}

请分析agent的回复是否满足任务需求,并生成合适的响应。

重要限制:
- 对话最多{self.max_turns}轮,当前是第{self.turn_count}轮
- 除首轮对话外,其他轮次尽可能简短地回答agent的问题,回复内容紧扣初始问题,禁止发散
- 如果agent在询问具体参数或设置,提供简洁明确的回答
- 如果agent已经提供了初始任务所需的信息或完成了任务,请立刻结束对话
- 禁止回复可能导致agent产生误解或偏离目标的内容

请以JSON格式回复:
{{
    "response": "你的回复内容",
    "continue": true/false  // 是否继续对话
}}
"""

    def get_bohr_results(self, agent_message: str, job_id: List[str]) -> Tuple[str, bool]:
        """
        Record the agent's message and reply with a fixed request for job results.

        No LLM call is made and the turn cap is not checked here.

        Args:
            agent_message: The agent's latest reply.
            job_id: Job id(s) interpolated verbatim into the request text.

        Returns:
            Tuple[str, bool]: (user reply, True) - this path always continues.

        Raises:
            ValueError: If no goal has been set.
        """
        if not self.goal:
            raise ValueError('未设置对话目标')

        self.turn_count += 1
        self._record('agent', agent_message)

        user_response = f'查看id为{job_id}的任务结果'
        self._record('user', user_response)

        return user_response, True

    def get_conversation_summary(self) -> Dict[str, Any]:
        """Return goal, turn count, final state, duration and full history."""
        return {
            'goal': self.goal.initial_question if self.goal else None,
            'total_turns': self.turn_count,
            'final_state': self.current_state.value,
            'duration_minutes': ((time.time() - self.start_time) / 60) if self.start_time else 0,
            'conversation_history': self.conversation_history
        }

    def get_last_user_response(self) -> str:
        """
        Return the most recent user reply.

        Falls back to the initial question when the history holds no user
        entry yet (which raises ValueError if no goal is set).
        """
        for entry in reversed(self.conversation_history):
            if 'user' in entry:
                return entry['user']
        return self.get_initial_question()
325 | | - |
326 | | - |
327 | | -async def test_with_adk_agent(file_path): |
| 118 | +async def evaluation_threads_task(file_path): |
328 | 119 | """与ADK agent进行多轮对话测试""" |
329 | 120 | print('=' * 80) |
330 | 121 | print('🤖 与ADK Agent多轮对话测试') |
|
0 commit comments