1313from codeflash .code_utils .config_consts import INDIVIDUAL_TESTCASE_TIMEOUT , TOTAL_LOOPING_TIME_EFFECTIVE
1414from codeflash .code_utils .worktree_pool import WorktreePool , WorktreeSlot # noqa: TC001
1515from codeflash .either import Failure , Success
16- from codeflash .languages .python .test_runner import async_execute_test_subprocess
1716
1817if TYPE_CHECKING :
1918 from codeflash .either import Result
2524 OptimizedCandidateResult ,
2625 OriginalCodeBaseline ,
2726 TestDiff ,
27+ TestResults ,
2828 )
2929
3030
@@ -40,26 +40,25 @@ class EvalFailure:
4040class _BehavioralPass :
4141 """Intermediate result: candidate passed behavioral tests, ready for benchmarking."""
4242
43- slot : WorktreeSlot
4443 candidate_index : int
4544 perf_test_files : list [str ]
4645 test_env : dict [str , str ]
4746 pytest_cmd_list : list [str ]
47+ behavior_test_results : TestResults
4848
4949
5050class ParallelCandidateEvaluator :
5151 """Evaluates optimization candidates in parallel using git worktrees.
5252
5353 Two-phase evaluation:
54- Phase 1 (concurrent): behavioral correctness tests
54+ Phase 1 (concurrent): behavioral correctness tests — slots released after each test
5555 Phase 2 (sequential): benchmarking — one candidate at a time for accurate timing
5656 """
5757
5858 def __init__ (self , optimizer : FunctionOptimizer , pool_size : int = 4 ) -> None :
5959 self ._optimizer = optimizer
6060 self ._pool_size = pool_size
6161 self ._pool : WorktreePool | None = None
62- self ._code_replace_lock = anyio .Lock ()
6362
6463 async def evaluate_candidates (
6564 self ,
@@ -80,7 +79,7 @@ async def evaluate_candidates(
8079 async with WorktreePool (pool_size = self ._pool_size ) as pool :
8180 self ._pool = pool
8281
83- # Phase 1: concurrent behavioral tests
82+ # Phase 1: concurrent behavioral tests (slots released after each test)
8483 behavioral_passes : list [tuple [int , CandidateNode , _BehavioralPass ]] = []
8584
8685 async with anyio .create_task_group () as tg :
@@ -100,14 +99,15 @@ async def evaluate_candidates(
10099
101100 # Phase 2: sequential benchmarking (no CPU contention)
102101 for result_index , candidate_node , bp in behavioral_passes :
102+ slot = await pool .acquire ()
103103 try :
104- bench_result = await self ._benchmark_phase (bp , original_code_baseline )
104+ bench_result = await self ._benchmark_phase (slot , bp , original_code_baseline )
105105 results [result_index ] = (candidate_node , bench_result )
106106 except Exception as exc :
107107 logger .error (f"Benchmark for { candidate_node .candidate .optimization_id } raised: { exc } " )
108108 results [result_index ] = (candidate_node , Failure (EvalFailure (message = str (exc ))))
109109 finally :
110- await pool .release (bp . slot )
110+ await pool .release (slot )
111111
112112 return results
113113
@@ -123,7 +123,7 @@ async def _behavioral_phase(
123123 results : list [tuple [CandidateNode , Result [OptimizedCandidateResult , EvalFailure ] | None ]],
124124 behavioral_passes : list [tuple [int , CandidateNode , _BehavioralPass ]],
125125 ) -> None :
126- """Run behavioral tests for a candidate. On pass, hold the slot for benchmarking ."""
126+ """Run behavioral tests for a candidate. Slot is always released after the test ."""
127127 assert self ._pool is not None
128128 slot = await self ._pool .acquire ()
129129 try :
@@ -135,18 +135,22 @@ async def _behavioral_phase(
135135 original_code_baseline = original_code_baseline ,
136136 original_helper_code = original_helper_code ,
137137 )
138- except Exception as exc :
138+ except BaseException as exc :
139+ if not isinstance (exc , Exception ):
140+ await self ._pool .release (slot )
141+ raise
139142 logger .error (f"Candidate { candidate_node .candidate .optimization_id } raised: { exc } " )
140143 results [result_index ] = (candidate_node , Failure (EvalFailure (message = str (exc ))))
141144 await self ._pool .release (slot )
142145 return
143146
147+ # Always release slot — Phase 2 re-acquires for benchmarking
148+ await self ._pool .release (slot )
149+
144150 if isinstance (outcome , Failure ):
145151 results [result_index ] = (candidate_node , outcome )
146- await self ._pool .release (slot )
147152 return
148153
149- # Behavioral pass — hold the slot for Phase 2
150154 behavioral_passes .append ((result_index , candidate_node , outcome .unwrap ()))
151155
152156 async def _run_behavioral (
@@ -162,10 +166,9 @@ async def _run_behavioral(
162166 opt = self ._optimizer
163167 fto = opt .function_to_optimize
164168
165- async with self ._code_replace_lock :
166- candidate_files = await anyio .to_thread .run_sync (
167- self ._replace_and_capture , opt , code_context , candidate , original_helper_code
168- )
169+ candidate_files = await anyio .to_thread .run_sync (
170+ self ._replace_and_capture , opt , code_context , candidate , original_helper_code
171+ )
169172
170173 if candidate_files is None :
171174 return Failure (EvalFailure (message = "Code replacement failed" ))
@@ -198,13 +201,14 @@ async def _run_behavioral(
198201 test_env ["PYTHONPATH" ] = str (worktree_project_root )
199202
200203 from codeflash .code_utils .compat import IS_POSIX , SAFE_SYS_EXECUTABLE
204+ from codeflash .languages .python .test_runner import async_execute_test_subprocess
201205
202206 pytest_cmd_list = opt .language_support .build_pytest_cmd (SAFE_SYS_EXECUTABLE , IS_POSIX ) # type: ignore[attr-defined]
203207
204208 blocklisted_plugins = ["benchmark" , "codspeed" , "xdist" , "sugar" ]
205209 blocklist_args = [f"-p no:{ plugin } " for plugin in blocklisted_plugins ]
206210
207- result_file_path = get_run_tmp_file (Path (f"pytest_results_candidate_{ candidate_index } .xml" ))
211+ result_file_path = get_run_tmp_file (Path (f"pytest_results_candidate_{ candidate_index } _ { slot . index } .xml" ))
208212 result_args = [f"--junitxml={ result_file_path .as_posix ()} " , "-o" , "junit_logging=all" ]
209213
210214 pytest_test_env = test_env .copy ()
@@ -250,24 +254,32 @@ async def _run_behavioral(
250254
251255 return Success (
252256 _BehavioralPass (
253- slot = slot ,
254257 candidate_index = candidate_index ,
255258 perf_test_files = perf_test_files ,
256259 test_env = pytest_test_env ,
257260 pytest_cmd_list = pytest_cmd_list ,
261+ behavior_test_results = behavior_test_results ,
258262 )
259263 )
260264
261265 async def _benchmark_phase (
262- self , bp : _BehavioralPass , original_code_baseline : OriginalCodeBaseline
266+ self , slot : WorktreeSlot , bp : _BehavioralPass , original_code_baseline : OriginalCodeBaseline
263267 ) -> Result [OptimizedCandidateResult , EvalFailure ]:
264268 """Run performance benchmarks sequentially for a candidate that passed behavioral tests."""
265269 opt = self ._optimizer
266270
271+ # Re-stage the candidate code in the acquired slot
272+ fto = opt .function_to_optimize
273+ for file in opt .test_files .test_files :
274+ if file .benchmarking_file_path and file .benchmarking_file_path .exists ():
275+ await slot .write_candidate (
276+ file .benchmarking_file_path , file .benchmarking_file_path .read_text (encoding = "utf-8" )
277+ )
278+
267279 blocklisted_plugins = ["benchmark" , "codspeed" , "xdist" , "sugar" ]
268280 blocklist_args = [f"-p no:{ plugin } " for plugin in blocklisted_plugins ]
269281
270- perf_result_file = get_run_tmp_file (Path (f"pytest_perf_candidate_{ bp .candidate_index } .xml" ))
282+ perf_result_file = get_run_tmp_file (Path (f"pytest_perf_candidate_{ bp .candidate_index } _ { slot . index } .xml" ))
271283 perf_result_args = [f"--junitxml={ perf_result_file .as_posix ()} " , "-o" , "junit_logging=all" ]
272284
273285 perf_pytest_args = [
@@ -282,8 +294,10 @@ async def _benchmark_phase(
282294
283295 perf_cmd = bp .pytest_cmd_list + perf_pytest_args + blocklist_args + perf_result_args + bp .perf_test_files
284296
297+ from codeflash .languages .python .test_runner import async_execute_test_subprocess
298+
285299 try :
286- await async_execute_test_subprocess (cmd_list = perf_cmd , cwd = bp . slot .path , env = bp .test_env , timeout = 600 )
300+ await async_execute_test_subprocess (cmd_list = perf_cmd , cwd = slot .path , env = bp .test_env , timeout = 600 )
287301 except subprocess .TimeoutExpired :
288302 logger .warning (f"Performance test timeout for candidate { bp .candidate_index } " )
289303 return Failure (EvalFailure (message = "Performance test timeout" ))
@@ -307,7 +321,7 @@ async def _benchmark_phase(
307321 OptimizedCandidateResult (
308322 max_loop_count = loop_count ,
309323 best_test_runtime = total_timing ,
310- behavior_test_results = None ,
324+ behavior_test_results = bp . behavior_test_results ,
311325 benchmarking_test_results = perf_test_results ,
312326 replay_benchmarking_test_results = None ,
313327 optimization_candidate_index = bp .candidate_index ,
0 commit comments