@@ -114,6 +114,33 @@ def run_agent_eval(
114114
115115 results : List [Tuple [int , Dict [str , Any ], str ]] = []
116116 tasks = list (range (len (dataset )))
117+ tasks_to_run = tasks
118+ if reuse :
119+ tasks_to_run = []
120+ for idx in tasks :
121+ if do_eval :
122+ eval_cached = store .load_eval (idx )
123+ if eval_cached is not None :
124+ cached_score = eval_cached .get ("score" , eval_cached )
125+ cached_final = eval_cached .get ("final_answer" , "" )
126+ if not cached_final :
127+ traj = store .load_traj (idx )
128+ if traj is not None :
129+ cached_final = traj .get ("final_answer" , "" )
130+ results .append ((idx , cached_score , cached_final ))
131+ continue
132+ tasks_to_run .append (idx )
133+ continue
134+
135+ if do_infer :
136+ traj = store .load_traj (idx )
137+ if traj and traj .get ("success" ):
138+ results .append ((idx , {}, traj .get ("final_answer" , "" )))
139+ else :
140+ tasks_to_run .append (idx )
141+ else :
142+ tasks_to_run .append (idx )
143+
117144 if nproc > 1 :
118145 with ThreadPoolExecutor (max_workers = nproc ) as executor :
119146 futures = [
@@ -128,15 +155,15 @@ def run_agent_eval(
128155 do_infer ,
129156 do_eval ,
130157 )
131- for idx in tasks
158+ for idx in tasks_to_run
132159 ]
133- with tqdm (total = len (tasks ), desc = "Agent Eval" , unit = "sample" ) as pbar :
160+ with tqdm (total = len (tasks_to_run ), desc = "Agent Eval" , unit = "sample" ) as pbar :
134161 for fut in as_completed (futures ):
135162 results .append (fut .result ())
136163 pbar .update (1 )
137164 else :
138- with tqdm (total = len (tasks ), desc = "Agent Eval" , unit = "sample" ) as pbar :
139- for idx in tasks :
165+ with tqdm (total = len (tasks_to_run ), desc = "Agent Eval" , unit = "sample" ) as pbar :
166+ for idx in tasks_to_run :
140167 results .append (
141168 _run_one_sample (
142169 idx , agent , dataset , store , judge_kwargs , reuse , do_infer , do_eval
0 commit comments