11from typing import Any , Dict , List
22from datasets import load_dataset
3- from ...smp import *
3+ import pandas as pd
4+ from ..utils .judge_util import build_judge
5+ from ...utils .mp_util import track_progress_rich
46from ..text_base import TextBaseDataset
5- from ..utils .judge_util import *
67from ...smp .file import dump , load , get_intermediate_file_path
7- from openai import OpenAI
8- import concurrent .futures
9- from tqdm import tqdm
108from json_repair import repair_json
11- import os
129
1310
1411def extract_final_answer (answer_with_thinking : str , start_tag = '<answer>' , end_tag = '</answer>' ):
@@ -21,62 +18,11 @@ def extract_final_answer(answer_with_thinking: str, start_tag='<answer>', end_ta
2118 return None
2219
2320
class LLM:
    """Small callable wrapper around the OpenAI chat-completions client.

    The API key and base URL are read from keyword arguments first and fall
    back to the ``OPENAI_API_KEY`` / ``OPENAI_API_BASE`` environment variables.
    """

    # Endpoint path that is removed from the configured base URL.
    # NOTE(review): presumably OPENAI_API_BASE holds the full endpoint URL
    # (".../v1/chat/completions") while the client expects only ".../v1";
    # the previous code dropped exactly len('/chat/completions') == 17 chars.
    _ENDPOINT_SUFFIX = '/chat/completions'

    def __init__(self, model='gpt-4.1', **kwargs):
        """Create the client.

        Args:
            model: Name of the chat model to query.
            **kwargs: Optional ``api_key`` and ``base_url`` overrides.

        Raises:
            ValueError: If no API key is supplied or found in the environment.
        """
        self.api_key = kwargs.get('api_key', os.environ.get('OPENAI_API_KEY'))
        # Validate the key before touching the URL so that a missing
        # configuration is reported as ValueError, not as a TypeError raised
        # while slicing ``None``.
        if not self.api_key:
            raise ValueError("API key is required.")
        self.base_url = self._normalize_base_url(
            kwargs.get('base_url', os.environ.get('OPENAI_API_BASE'))
        )
        self.model = model
        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)

    @classmethod
    def _normalize_base_url(cls, base_url):
        """Return ``base_url`` without a trailing ``/chat/completions`` path.

        ``None`` and empty values are returned unchanged, as are URLs that do
        not end with the endpoint path.
        """
        if not base_url:
            return base_url
        trimmed = base_url.rstrip('/')
        if trimmed.endswith(cls._ENDPOINT_SUFFIX):
            return trimmed[:-len(cls._ENDPOINT_SUFFIX)]
        return base_url

    def __call__(self, query=None, **kwargs):
        """Send ``query`` as a single user turn and return the reply text.

        Args:
            query: Content of the user message.
            **kwargs: Optional ``system_prompt`` (default
                ``'You are a helpful assistant.'``), ``max_tokens``
                (default ``None``) and ``temperature`` (default ``0``).

        Returns:
            The ``content`` of the first choice in the completion response.
        """
        system_prompt = kwargs.get('system_prompt', 'You are a helpful assistant.')
        max_tokens = kwargs.get('max_tokens', None)
        temperature = kwargs.get('temperature', 0)

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": query},
        ]

        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
        )
        return response.choices[0].message.content
52-
53-
def multi_process(inp_list, function, max_workers=40):
    """Call ``function(**item)`` for every item of ``inp_list`` in a process pool.

    Args:
        inp_list: Sized sequence of dicts; each dict is unpacked as keyword
            arguments for ``function``.
        function: Callable executed in the worker processes.
        max_workers: Upper bound on the number of worker processes. ``None``
            is forwarded unchanged to ``ProcessPoolExecutor``.

    Returns:
        A list aligned with ``inp_list``. The slot of an item whose call
        raised is left as ``None`` and the error is reported on the console.
    """
    total = len(inp_list)
    if total == 0:
        # Nothing to do: avoid creating a pool at all.
        return []

    results = [None] * total
    # Never ask for more workers than there are tasks.
    worker_count = max_workers if max_workers is None else min(max_workers, total)

    with concurrent.futures.ProcessPoolExecutor(max_workers=worker_count) as executor:
        future_to_index = {
            executor.submit(function, **item): index
            for index, item in enumerate(inp_list)
        }

        for future in tqdm(concurrent.futures.as_completed(future_to_index), total=len(future_to_index)):
            index = future_to_index[future]
            try:
                results[index] = future.result()
            except Exception as e:
                # Best effort: keep going and leave this slot as None.
                # tqdm.write keeps the message from corrupting the progress bar.
                tqdm.write(f"Error processing item {inp_list[index]}: {e}")

    return results
71-
72-
73- judge = LLM ('o4-mini' )
74-
75-
76- def eval_model_output (ques_dict ):
21+ def eval_model_output (ques_dict , judge_kwargs ):
7722 newline = '\n '
7823 prompt = f"""
7924You are an expert in systematically validating and evaluating LLM-generated solutions. Your task is to rigorously analyze the correctness of a provided solution by comparing it step-by-step against the reference solution, and output **only** a structured verification list—with no additional text.
25+
8026## Instructions
81271. Break down the given LLM solution into individual steps and evaluate each one against the corresponding reference solution steps.
82282. For each step, include the following three components:
@@ -87,16 +33,22 @@ def eval_model_output(ques_dict):
87334. Justify your judgments rigorously, pointing out even minor inaccuracies or logical flaws.
88345. Do not attempt to answer the original question—your role is strictly to evaluate.
89356. Output **only** a list of dictionaries in the exact format provided below. Do not include any other text or comments.
36+
9037## Question
9138{ ques_dict ['question' ]}
39+
9240## Reference Solution Steps
9341{ newline .join (ques_dict ['steps' ])}
42+
9443## Reference Answer
9544{ ques_dict ['answer' ]}
45+
9646## LLM Solution Steps
9747{ ques_dict ['prediction' ]}
48+
9849## LLM Answer
9950{ extract_final_answer (ques_dict ['prediction' ])}
51+
10052## Output Example
10153[
10254 {{"solution_step": "step content", "reason": "reason of the judgement", "judge": "correct or incorrect"}},
@@ -105,7 +57,12 @@ def eval_model_output(ques_dict):
10557"""
10658
10759 try :
108- llm_judge = judge (prompt )
60+ messages = [
61+ {"role" : "system" , "value" : "You are a helpful assistant." , "type" : "text" },
62+ {"role" : "user" , "value" : prompt , "type" : "text" },
63+ ]
64+ judge = build_judge (** judge_kwargs )
65+ llm_judge = judge .generate (messages )
10966 start_index = llm_judge .find ('[' )
11067 end_index = llm_judge .rfind (']' ) + 1
11168 llm_judge = eval (repair_json (llm_judge [start_index :end_index ]))
@@ -114,12 +71,13 @@ def eval_model_output(ques_dict):
11471 if step ["judge" ] == "correct" :
11572 correct_step_count += 1
11673 step_level_acc = correct_step_count / len (llm_judge )
117- except :
74+ except Exception as e :
75+ print (e )
11876 llm_judge = None
11977
12078 ques_dict ['exact_match' ] = 1 if (
121- ques_dict ['answer' ] == ques_dict ['prediction' ] or ques_dict ['answer' ] == extract_final_answer (
122- ques_dict ['prediction' ])) else 0
79+ ques_dict ['answer' ] == ques_dict ['prediction' ] or ques_dict ['answer' ] == extract_final_answer (
80+ ques_dict ['prediction' ])) else 0
12381 ques_dict ['llm_judge' ] = llm_judge
12482 ques_dict ['step_level_acc' ] = step_level_acc
12583 return ques_dict
@@ -133,7 +91,7 @@ def supported_datasets(cls):
13391 return ["SGI-DeepResearch" ]
13492
13593 def load_data (self , dataset ):
136- hf = load_dataset ("InternScience/SGI-DeepResearch" ,split = "test" )
94+ hf = load_dataset ("InternScience/SGI-DeepResearch" , split = "test" )
13795
13896 rows : List [Dict [str , Any ]] = []
13997 idx = 0
@@ -153,12 +111,12 @@ def load_data(self, dataset):
153111 idx += 1
154112 return pd .DataFrame (rows )
155113
156-
157114 def build_prompt (self , line ):
158115 if isinstance (line , int ):
159116 line = self .data .iloc [line ]
160117 question = line ['question' ] + """
161118You can reason step by step before giving the final answer. The final answer should be enclosed by <answer> and </answer>.
119+
162120Example:
163121Step 1. ...
164122Step 2. ...
@@ -173,8 +131,22 @@ def evaluate(self, eval_file, **judge_kwargs):
173131 data = load (eval_file )
174132 data = pd .DataFrame (data )
175133
176- inp_list = [{"ques_dict" : item } for item in data .to_dict (orient = "records" )]
177- out_list = multi_process (inp_list , eval_model_output , 48 )
134+ if judge_kwargs .get ('model' ) is None :
135+ judge_kwargs ['model' ] = 'o4-mini'
136+ if judge_kwargs .get ('max_tokens' ) is None :
137+ judge_kwargs ['max_tokens' ] = None
138+
139+ inp_list = []
140+ for item in data .to_dict (orient = "records" ):
141+ inp_list .append ({
142+ "ques_dict" : item ,
143+ "judge_kwargs" : judge_kwargs
144+ })
145+ out_list = track_progress_rich (
146+ func = eval_model_output ,
147+ tasks = inp_list ,
148+ nproc = judge_kwargs .get ('nproc' , 48 )
149+ )
178150
179151 exact_match = sum ([item ['exact_match' ] for item in out_list ]) / len (out_list )
180152 step_level_acc = sum ([item ['step_level_acc' ] for item in out_list ]) / len (out_list )
0 commit comments