Skip to content

Commit 486636c

Browse files
committed
feat: introduce evaluation module and add structure generation and answer quality experiments
1 parent d9bf3f8 commit 486636c

19 files changed

Lines changed: 238 additions & 650 deletions

evaluate/base/__init__.py

Whitespace-only changes.
Lines changed: 2 additions & 211 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,6 @@
44
import re
55
import time
66
import uuid
7-
from dataclasses import dataclass
8-
from enum import Enum
9-
from typing import List, Dict, Any, Optional, Tuple
107

118
from bohrium import Bohrium
129
from dotenv import load_dotenv, find_dotenv
@@ -15,11 +12,11 @@
1512
from google.adk.agents.run_config import StreamingMode
1613
from google.adk.sessions import InMemorySessionService
1714
from google.genai import types
18-
from litellm import completion
1915

2016
from agents.matmaster_agent.agent import root_agent
2117
from agents.matmaster_agent.constant import MATMASTER_AGENT_NAME
2218
from agents.matmaster_agent.utils.event_utils import is_function_call
19+
from evaluate.base.human_simulator import ConversationGoal, HumanSimulator
2320
from evaluate.utils import load_dataset_json
2421

2522
logger = logging.getLogger(__name__)
@@ -118,213 +115,7 @@ def multi_turn_evaluation_task(dataset_item):
118115
return result
119116

120117

121-
class ConversationState(Enum):
    """Lifecycle states of a simulated conversation."""

    # No agent reply has been processed yet.
    INITIAL = 'initial'
    # The conversation is ongoing.
    IN_PROGRESS = 'in_progress'
    # The simulated user decided to stop the conversation.
    SATISFIED = 'satisfied'
    # The conversation was cut off at the turn limit.
    TIMEOUT = 'timeout'
127-
128-
129-
@dataclass
class ConversationGoal:
    """Definition of the goal a simulated conversation is driving towards."""

    # The question the simulated user opens the conversation with.
    initial_question: str
    # Outcomes the user expects; joined with ', ' into the simulator prompt.
    expected_outcomes: List[str]
    # Success criteria; joined with ', ' into the simulator prompt.
    success_criteria: List[str]
135-
136-
137-
class HumanSimulator:
    """Simplified human simulator for evaluating a multi-turn conversational agent.

    Responsibilities:
    1. Play the role of a user talking to the agent.
    2. Hold the conversation goal.
    3. Produce context-aware replies by prompting an LLM.
    4. Cap the conversation at ``max_turns`` turns (10 by default).
    """

    def __init__(self, model: str = 'deepseek/deepseek-chat', max_turns: int = 10):
        """Create a simulator.

        Args:
            model: Model identifier passed to ``completion``.
            max_turns: Maximum number of agent turns before the simulated user
                ends the conversation.

        Raises:
            ValueError: If ``max_turns`` is smaller than 1.
        """
        if max_turns < 1:
            raise ValueError('max_turns must be >= 1')
        self.model = model
        self.max_turns = max_turns
        self.conversation_history: List[Dict[str, Any]] = []
        self.current_state = ConversationState.INITIAL
        self.turn_count = 0
        self.start_time: Optional[float] = None
        self.goal: Optional['ConversationGoal'] = None

    def set_goal(self, goal: 'ConversationGoal'):
        """Set the conversation goal and reset all per-conversation state."""
        self.goal = goal
        self.current_state = ConversationState.INITIAL
        self.turn_count = 0
        # Rebind (rather than clear) so that summaries handed out for a previous
        # conversation keep their own history list.
        self.conversation_history = []
        self.start_time = time.time()
        logger.info(f"设置对话目标: {goal.initial_question}")

    def get_initial_question(self) -> str:
        """Return the goal's opening question.

        Raises:
            ValueError: If no goal has been set.
        """
        if self.goal is None:
            raise ValueError('未设置对话目标')
        return self.goal.initial_question

    def _record(self, speaker: str, message: str) -> None:
        """Append one history entry keyed by ``speaker`` ('agent' or 'user')."""
        self.conversation_history.append({
            'turn': self.turn_count,
            speaker: message,
            'timestamp': time.time(),
        })

    def generate_response(self, agent_message: str) -> Tuple[str, bool]:
        """Generate the simulated user's reply to the agent's latest message.

        Args:
            agent_message: The agent's reply text.

        Returns:
            Tuple[str, bool]: (user reply, whether the conversation continues).

        Raises:
            ValueError: If no goal has been set.
        """
        if self.goal is None:
            raise ValueError('未设置对话目标')

        self.turn_count += 1
        self._record('agent', agent_message)

        # Turn limit reached: end the conversation without calling the LLM.
        if self.turn_count >= self.max_turns:
            self.current_state = ConversationState.TIMEOUT
            farewell = f'我们已经聊了{self.max_turns}轮了,我想结束这个对话。'
            # Record the farewell too, so the history and
            # get_last_user_response() reflect what was actually sent.
            self._record('user', farewell)
            return farewell, False

        user_response, should_continue = self._generate_user_response(agent_message)

        self.current_state = (
            ConversationState.IN_PROGRESS if should_continue
            else ConversationState.SATISFIED
        )
        self._record('user', user_response)

        return user_response, should_continue

    @staticmethod
    def _parse_llm_json(raw: str) -> Dict[str, Any]:
        """Parse a JSON object out of raw LLM output.

        Accepts plain JSON as well as JSON wrapped in Markdown fences or
        surrounded by extra text, by falling back to the outermost ``{...}`` span.

        Raises:
            ValueError: If no JSON object can be extracted
                (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        """
        text = (raw or '').strip()
        try:
            parsed = json.loads(text)
        except json.JSONDecodeError:
            match = re.search(r'\{.*\}', text, re.DOTALL)
            if match is None:
                raise
            parsed = json.loads(match.group(0))
        if not isinstance(parsed, dict):
            raise ValueError('LLM reply is not a JSON object')
        return parsed

    @staticmethod
    def _coerce_bool(value: Any, default: bool = True) -> bool:
        """Interpret a JSON value as a boolean; ``default`` if it is missing or unclear."""
        if isinstance(value, bool):
            return value
        if value is None:
            return default
        if isinstance(value, str):
            lowered = value.strip().lower()
            if lowered in ('true', 'yes', '1'):
                return True
            if lowered in ('false', 'no', '0'):
                return False
            return default
        return bool(value)

    def _generate_user_response(self, agent_message: str) -> Tuple[str, bool]:
        """Ask the LLM for the user's reply; fall back to a neutral 'continue' on failure."""
        prompt = self._build_response_prompt(agent_message)

        try:
            response = completion(
                model=self.model,
                messages=[{'role': 'user', 'content': prompt}],
                temperature=0.7
            )
            result = self._parse_llm_json(response.choices[0].message.content)
        except Exception as e:
            # Deliberate best-effort: an LLM or parsing failure must not abort
            # the evaluation run, so log it and keep the conversation going.
            logger.error(f"生成用户响应失败: {e}")
            return '我理解了,请继续。', True

        reply = result.get('response')
        user_response = '我理解了。' if reply is None else str(reply)
        should_continue = self._coerce_bool(result.get('continue'), default=True)

        logger.info(f"用户响应生成 - 轮次: {self.turn_count}, 继续: {should_continue}")

        return user_response, should_continue

    def _build_response_prompt(self, agent_message: str) -> str:
        """Build the prompt that asks the LLM to play the user."""
        return f"""
你是一个模拟用户,正在与一个材料计算AI agent进行多轮对话。请基于以下信息生成合适的响应:

对话目标:
- 初始问题: {self.goal.initial_question}
- 期望结果: {', '.join(self.goal.expected_outcomes)}
- 成功标准: {', '.join(self.goal.success_criteria)}

当前状态:
- 对话轮次: {self.turn_count}/{self.max_turns}

Agent最新回复:
{agent_message}

请分析agent的回复是否满足任务需求,并生成合适的响应。

重要限制:
- 对话最多{self.max_turns}轮,当前是第{self.turn_count}轮
- 除首轮对话外,其他轮次尽可能简短地回答agent的问题,回复内容紧扣初始问题,禁止发散
- 如果agent在询问具体参数或设置,提供简洁明确的回答
- 如果agent已经提供了初始任务所需的信息或完成了任务,请立刻结束对话
- 禁止回复可能导致agent产生误解或偏离目标的内容

请以JSON格式回复:
{{
    "response": "你的回复内容",
    "continue": true/false  // 是否继续对话
}}
"""

    def get_bohr_results(self, agent_message: str, job_id: List[str]) -> Tuple[str, bool]:
        """Record the agent's message and reply with a fixed request for job results.

        No LLM call is made and the turn limit is not checked here.

        Args:
            agent_message: The agent's reply text.
            job_id: Job id(s) interpolated verbatim into the request message.

        Returns:
            Tuple[str, bool]: (user reply, True); the conversation always continues.

        Raises:
            ValueError: If no goal has been set.
        """
        if self.goal is None:
            raise ValueError('未设置对话目标')

        self.turn_count += 1
        self._record('agent', agent_message)

        user_response = f'查看id为{job_id}的任务结果'
        self.current_state = ConversationState.IN_PROGRESS
        self._record('user', user_response)

        return user_response, True

    def get_conversation_summary(self) -> Dict[str, Any]:
        """Return a summary dict: goal, turn count, final state, duration and history."""
        return {
            'goal': self.goal.initial_question if self.goal else None,
            'total_turns': self.turn_count,
            'final_state': self.current_state.value,
            'duration_minutes': ((time.time() - self.start_time) / 60) if self.start_time else 0,
            'conversation_history': self.conversation_history
        }

    def get_last_user_response(self) -> str:
        """Return the most recent user message, or the initial question if there is none."""
        for entry in reversed(self.conversation_history):
            if 'user' in entry:
                return entry['user']
        return self.get_initial_question()
325-
326-
327-
async def test_with_adk_agent(file_path):
118+
async def evaluation_threads_task(file_path):
328119
"""与ADK agent进行多轮对话测试"""
329120
print('=' * 80)
330121
print('🤖 与ADK Agent多轮对话测试')

0 commit comments

Comments
 (0)