Skip to content

Commit 8084020

Browse files
committed
Modify SGI-Bench: add idea generation part
1 parent 3521713 commit 8084020

9 files changed

Lines changed: 1676 additions & 326 deletions

File tree

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,5 @@ math-verify
5252
# wrapt_timeout_decorator
5353
## chemBench
5454
loguru
55+
## SGI Idea Generation
56+
sentence_transformers

scieval/dataset/SGI_Bench_1_0/deep_research.py

Lines changed: 39 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,11 @@
11
from typing import Any, Dict, List
22
from datasets import load_dataset
3-
from ...smp import *
3+
import pandas as pd
4+
from ..utils.judge_util import build_judge
5+
from ...utils.mp_util import track_progress_rich
46
from ..text_base import TextBaseDataset
5-
from ..utils.judge_util import *
67
from ...smp.file import dump, load, get_intermediate_file_path
7-
from openai import OpenAI
8-
import concurrent.futures
9-
from tqdm import tqdm
108
from json_repair import repair_json
11-
import os
129

1310

1411
def extract_final_answer(answer_with_thinking: str, start_tag='<answer>', end_tag='</answer>'):
@@ -21,62 +18,11 @@ def extract_final_answer(answer_with_thinking: str, start_tag='<answer>', end_ta
2118
return None
2219

2320

24-
class LLM:
25-
def __init__(self, model='gpt-4.1', **kwargs):
26-
self.api_key = kwargs.get('api_key', os.environ.get('OPENAI_API_KEY')) # export OPENAI_API_KEY="xxxxx"
27-
self.base_url = kwargs.get('base_url', os.environ.get('OPENAI_API_BASE'))
28-
self.base_url = self.base_url[:-17] # export OPENAI_BASE_URL="xxxxx"
29-
self.model = model
30-
if not self.api_key:
31-
raise ValueError("API key is required.")
32-
self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
33-
34-
def __call__(self, query=None, **kwargs):
35-
system_prompt = kwargs.get('system_prompt', 'You are a helpful assistant.')
36-
max_tokens = kwargs.get('max_tokens', None)
37-
temperature = kwargs.get('temperature', 0)
38-
39-
messages = [
40-
{"role": "system", "content": system_prompt},
41-
{"role": "user", "content": query},
42-
]
43-
44-
response = self.client.chat.completions.create(
45-
model=self.model,
46-
messages=messages,
47-
max_tokens=max_tokens,
48-
temperature=temperature,
49-
)
50-
assistant_response = response.choices[0].message.content
51-
return assistant_response
52-
53-
54-
def multi_process(inp_list, function, max_workers=40):
55-
results = [None] * len(inp_list)
56-
with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
57-
future_to_index = {
58-
executor.submit(function, **item): index
59-
for index, item in enumerate(inp_list)
60-
}
61-
62-
for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(future_to_index)):
63-
index = future_to_index[future]
64-
try:
65-
result = future.result()
66-
results[index] = result
67-
except Exception as e:
68-
print(f"Error processing item {inp_list[index]}: {str(e)}")
69-
70-
return results
71-
72-
73-
judge = LLM('o4-mini')
74-
75-
76-
def eval_model_output(ques_dict):
21+
def eval_model_output(ques_dict, judge_kwargs):
7722
newline = '\n'
7823
prompt = f"""
7924
You are an expert in systematically validating and evaluating LLM-generated solutions. Your task is to rigorously analyze the correctness of a provided solution by comparing it step-by-step against the reference solution, and output **only** a structured verification list—with no additional text.
25+
8026
## Instructions
8127
1. Break down the given LLM solution into individual steps and evaluate each one against the corresponding reference solution steps.
8228
2. For each step, include the following three components:
@@ -87,16 +33,22 @@ def eval_model_output(ques_dict):
8733
4. Justify your judgments rigorously, pointing out even minor inaccuracies or logical flaws.
8834
5. Do not attempt to answer the original question—your role is strictly to evaluate.
8935
6. Output **only** a list of dictionaries in the exact format provided below. Do not include any other text or comments.
36+
9037
## Question
9138
{ques_dict['question']}
39+
9240
## Reference Solution Steps
9341
{newline.join(ques_dict['steps'])}
42+
9443
## Reference Answer
9544
{ques_dict['answer']}
45+
9646
## LLM Solution Steps
9747
{ques_dict['prediction']}
48+
9849
## LLM Answer
9950
{extract_final_answer(ques_dict['prediction'])}
51+
10052
## Output Example
10153
[
10254
{{"solution_step": "step content", "reason": "reason of the judgement", "judge": "correct or incorrect"}},
@@ -105,7 +57,12 @@ def eval_model_output(ques_dict):
10557
"""
10658

10759
try:
108-
llm_judge = judge(prompt)
60+
messages = [
61+
{"role": "system", "value": "You are a helpful assistant.", "type": "text"},
62+
{"role": "user", "value": prompt, "type": "text"},
63+
]
64+
judge = build_judge(**judge_kwargs)
65+
llm_judge = judge.generate(messages)
10966
start_index = llm_judge.find('[')
11067
end_index = llm_judge.rfind(']') + 1
11168
llm_judge = eval(repair_json(llm_judge[start_index:end_index]))
@@ -114,12 +71,13 @@ def eval_model_output(ques_dict):
11471
if step["judge"] == "correct":
11572
correct_step_count += 1
11673
step_level_acc = correct_step_count / len(llm_judge)
117-
except:
74+
except Exception as e:
75+
print(e)
11876
llm_judge = None
11977

12078
ques_dict['exact_match'] = 1 if (
121-
ques_dict['answer'] == ques_dict['prediction'] or ques_dict['answer'] == extract_final_answer(
122-
ques_dict['prediction'])) else 0
79+
ques_dict['answer'] == ques_dict['prediction'] or ques_dict['answer'] == extract_final_answer(
80+
ques_dict['prediction'])) else 0
12381
ques_dict['llm_judge'] = llm_judge
12482
ques_dict['step_level_acc'] = step_level_acc
12583
return ques_dict
@@ -133,7 +91,7 @@ def supported_datasets(cls):
13391
return ["SGI-DeepResearch"]
13492

13593
def load_data(self, dataset):
136-
hf = load_dataset("InternScience/SGI-DeepResearch",split="test")
94+
hf = load_dataset("InternScience/SGI-DeepResearch", split="test")
13795

13896
rows: List[Dict[str, Any]] = []
13997
idx = 0
@@ -153,12 +111,12 @@ def load_data(self, dataset):
153111
idx += 1
154112
return pd.DataFrame(rows)
155113

156-
157114
def build_prompt(self, line):
158115
if isinstance(line, int):
159116
line = self.data.iloc[line]
160117
question = line['question'] + """
161118
You can reason step by step before giving the final answer. The final answer should be enclosed by <answer> and </answer>.
119+
162120
Example:
163121
Step 1. ...
164122
Step 2. ...
@@ -173,8 +131,22 @@ def evaluate(self, eval_file, **judge_kwargs):
173131
data = load(eval_file)
174132
data = pd.DataFrame(data)
175133

176-
inp_list = [{"ques_dict": item} for item in data.to_dict(orient="records")]
177-
out_list = multi_process(inp_list, eval_model_output, 48)
134+
if judge_kwargs.get('model') is None:
135+
judge_kwargs['model'] = 'o4-mini'
136+
if judge_kwargs.get('max_tokens') is None:
137+
judge_kwargs['max_tokens'] = None
138+
139+
inp_list = []
140+
for item in data.to_dict(orient="records"):
141+
inp_list.append({
142+
"ques_dict": item,
143+
"judge_kwargs": judge_kwargs
144+
})
145+
out_list = track_progress_rich(
146+
func=eval_model_output,
147+
tasks=inp_list,
148+
nproc=judge_kwargs.get('nproc', 48)
149+
)
178150

179151
exact_match = sum([item['exact_match'] for item in out_list]) / len(out_list)
180152
step_level_acc = sum([item['step_level_acc'] for item in out_list]) / len(out_list)

0 commit comments

Comments
 (0)