codeflash-ai
diff --git a/codeflash/optimization/parallel_evaluator.py b/codeflash/optimization/parallel_evaluator.py
Lines changed: 132 additions & 73 deletions
@@ -36,8 +36,24 @@ class EvalFailure:
     diffs: list[TestDiff] = dataclasses.field(default_factory=list)
 
 
+@dataclasses.dataclass(slots=True)
+class _BehavioralPass:
+    """Intermediate result: candidate passed behavioral tests, ready for benchmarking."""
+
+    slot: WorktreeSlot  # kept acquired until the benchmark loop releases it; NOTE(review): confirm acquire() cannot starve when passes fill the pool
+    candidate_index: int  # names the perf junit XML file and is reported as optimization_candidate_index
+    perf_test_files: list[str]  # benchmarking test paths mirrored into the slot; appended to the perf pytest command
+    test_env: dict[str, str]  # environment for the benchmark subprocess (the behavioral env, including PYTEST_PLUGINS)
+    pytest_cmd_list: list[str]  # base pytest command from build_pytest_cmd, reused as the perf command prefix
+
+
 class ParallelCandidateEvaluator:
-    """Evaluates optimization candidates in parallel using git worktrees."""
+    """Evaluates optimization candidates in parallel using git worktrees.
+
+    Two-phase evaluation:
+      Phase 1 (concurrent): behavioral correctness tests
+      Phase 2 (sequential): benchmarking — one candidate at a time for accurate timing
+    """
 
     def __init__(self, optimizer: FunctionOptimizer, pool_size: int = 4) -> None:
         self._optimizer = optimizer
@@ -53,31 +69,49 @@ async def evaluate_candidates(
         original_helper_code: dict[Path, str],
         file_path_to_helper_classes: dict[Path, set[str]],
     ) -> list[tuple[CandidateNode, Result[OptimizedCandidateResult, EvalFailure] | None]]:
-        """Evaluate candidates concurrently using worktrees."""
+        """Evaluate candidates: behavioral tests concurrently, benchmarks sequentially."""
         results: list[tuple[CandidateNode, Result[OptimizedCandidateResult, EvalFailure] | None]] = [
             (node, None) for node, _, _ in candidates
         ]
 
-        if candidates:
-            async with WorktreePool(pool_size=self._pool_size) as pool:
-                self._pool = pool
-                async with anyio.create_task_group() as tg:
-                    for i, (node, idx, _cached) in enumerate(candidates):
-                        tg.start_soon(
-                            self._evaluate_and_store,
-                            i,
-                            node,
-                            idx,
-                            code_context,
-                            original_code_baseline,
-                            original_helper_code,
-                            file_path_to_helper_classes,
-                            results,
-                        )
+        if not candidates:
+            return results
+
+        async with WorktreePool(pool_size=self._pool_size) as pool:
+            self._pool = pool
+
+            # Phase 1: concurrent behavioral tests
+            behavioral_passes: list[tuple[int, CandidateNode, _BehavioralPass]] = []
+
+            async with anyio.create_task_group() as tg:
+                for i, (node, idx, _cached) in enumerate(candidates):
+                    tg.start_soon(
+                        self._behavioral_phase,
+                        i,
+                        node,
+                        idx,
+                        code_context,
+                        original_code_baseline,
+                        original_helper_code,
+                        file_path_to_helper_classes,
+                        results,
+                        behavioral_passes,
+                    )
+
+            # Phase 2: sequential benchmarking (no CPU contention)
+            for result_index, candidate_node, bp in behavioral_passes:
+                try:
+                    bench_result = await self._benchmark_phase(bp, original_code_baseline)
+                    results[result_index] = (candidate_node, bench_result)
+                except Exception as exc:
+                    logger.error(f"Benchmark for {candidate_node.candidate.optimization_id} raised: {exc}")
+                    results[result_index] = (candidate_node, Failure(EvalFailure(message=str(exc))))
+                finally:
+                    await pool.release(bp.slot)
 
         return results
 
-    async def _evaluate_and_store(
+    async def _behavioral_phase(
         self,
         result_index: int,
         candidate_node: CandidateNode,
@@ -87,65 +121,44 @@ async def _evaluate_and_store(
         original_helper_code: dict[Path, str],
         file_path_to_helper_classes: dict[Path, set[str]],
         results: list[tuple[CandidateNode, Result[OptimizedCandidateResult, EvalFailure] | None]],
+        behavioral_passes: list[tuple[int, CandidateNode, _BehavioralPass]],
     ) -> None:
-        """Evaluate a single candidate and store the result."""
+        """Run behavioral tests for a candidate. On pass, hold the slot for benchmarking."""
         assert self._pool is not None
         slot = await self._pool.acquire()
         try:
-            result = await self._run_in_worktree(
+            outcome = await self._run_behavioral(
                 slot=slot,
                 candidate=candidate_node.candidate,
                 candidate_index=candidate_index,
                 code_context=code_context,
                 original_code_baseline=original_code_baseline,
                 original_helper_code=original_helper_code,
-                file_path_to_helper_classes=file_path_to_helper_classes,
             )
-            results[result_index] = (candidate_node, result)
         except Exception as exc:
             logger.error(f"Candidate {candidate_node.candidate.optimization_id} raised: {exc}")
             results[result_index] = (candidate_node, Failure(EvalFailure(message=str(exc))))
-        finally:
             await self._pool.release(slot)
+            return
 
-    @staticmethod
-    def _replace_and_capture(
-        opt: FunctionOptimizer,
-        code_context: CodeOptimizationContext,
-        candidate: OptimizedCandidate,
-        original_helper_code: dict[Path, str],
-    ) -> tuple[str, dict[Path, str]] | None:
-        """Apply code replacement to main tree, capture the result, restore original."""
-        fto = opt.function_to_optimize
-        try:
-            did_update = opt.replace_function_and_helpers_with_optimized_code(
-                code_context=code_context,
-                optimized_code=candidate.source_code,
-                original_helper_code=original_helper_code,
-            )
-            if not did_update:
-                return None
+        if isinstance(outcome, Failure):
+            results[result_index] = (candidate_node, outcome)
+            await self._pool.release(slot)
+            return
 
-            fto_code = Path(fto.file_path).read_text("utf-8")
-            helper_codes = {Path(p): Path(p).read_text("utf-8") for p in original_helper_code}
-            return fto_code, helper_codes
-        except (ValueError, SyntaxError, AttributeError) as e:
-            logger.error(f"Code replacement failed: {e}")
-            return None
-        finally:
-            opt.write_code_and_helpers(opt.function_to_optimize_source_code, original_helper_code, fto.file_path)
+        # Behavioral pass — hold the slot for Phase 2
+        behavioral_passes.append((result_index, candidate_node, outcome.unwrap()))
 
-    async def _run_in_worktree(
+    async def _run_behavioral(
         self,
         slot: WorktreeSlot,
         candidate: OptimizedCandidate,
         candidate_index: int,
         code_context: CodeOptimizationContext,
         original_code_baseline: OriginalCodeBaseline,
         original_helper_code: dict[Path, str],
-        file_path_to_helper_classes: dict[Path, set[str]],
-    ) -> Result[OptimizedCandidateResult, EvalFailure]:
-        """Run behavioral and performance tests for a candidate in a worktree slot."""
+    ) -> Result[_BehavioralPass, EvalFailure]:
+        """Run behavioral tests in a worktree. Returns pass info or failure."""
         opt = self._optimizer
         fto = opt.function_to_optimize
 
@@ -162,7 +175,7 @@ async def _run_in_worktree(
         for module_abspath, helper_code in helper_codes.items():
             await slot.write_candidate(module_abspath, helper_code)
 
-        # Copy instrumented test files into the worktree (they're runtime-generated, not in git)
+        # Copy instrumented test files into the worktree
         behavior_test_files: list[str] = []
         perf_test_files: list[str] = []
         for file in opt.test_files.test_files:
@@ -177,8 +190,7 @@ async def _run_in_worktree(
                 )
                 perf_test_files.append(str(slot.mirror(file.benchmarking_file_path)))
 
-        # Run behavioral tests in the worktree
-        worktree_cwd = slot.path
+        # Build test environment and command
         test_env = opt.get_test_env(
             codeflash_loop_index=0, codeflash_test_iteration=candidate_index, codeflash_tracer_disable=1
         )
@@ -189,6 +201,15 @@ async def _run_in_worktree(
 
         pytest_cmd_list = opt.language_support.build_pytest_cmd(SAFE_SYS_EXECUTABLE, IS_POSIX)  # type: ignore[attr-defined]
 
+        blocklisted_plugins = ["benchmark", "codspeed", "xdist", "sugar"]
+        blocklist_args = [f"-p no:{plugin}" for plugin in blocklisted_plugins]
+
+        result_file_path = get_run_tmp_file(Path(f"pytest_results_candidate_{candidate_index}.xml"))
+        result_args = [f"--junitxml={result_file_path.as_posix()}", "-o", "junit_logging=all"]
+
+        pytest_test_env = test_env.copy()
+        pytest_test_env["PYTEST_PLUGINS"] = "codeflash.verification.pytest_plugin"
+
         common_pytest_args = [
             "--capture=tee-sys",
             "-q",
@@ -199,20 +220,11 @@ async def _run_in_worktree(
             f"--timeout={INDIVIDUAL_TESTCASE_TIMEOUT}",
         ]
 
-        blocklisted_plugins = ["benchmark", "codspeed", "xdist", "sugar"]
-        blocklist_args = [f"-p no:{plugin}" for plugin in blocklisted_plugins]
-
-        result_file_path = get_run_tmp_file(Path(f"pytest_results_candidate_{candidate_index}.xml"))
-        result_args = [f"--junitxml={result_file_path.as_posix()}", "-o", "junit_logging=all"]
-
-        pytest_test_env = test_env.copy()
-        pytest_test_env["PYTEST_PLUGINS"] = "codeflash.verification.pytest_plugin"
-
         cmd = pytest_cmd_list + common_pytest_args + blocklist_args + result_args + behavior_test_files
 
         try:
             behavior_result = await async_execute_test_subprocess(
-                cmd_list=cmd, cwd=worktree_cwd, env=pytest_test_env, timeout=600
+                cmd_list=cmd, cwd=slot.path, env=pytest_test_env, timeout=600
             )
         except subprocess.TimeoutExpired:
             logger.warning(f"Behavioral test timeout for candidate {candidate_index}")
@@ -236,8 +248,26 @@ async def _run_in_worktree(
         if not match:
             return Failure(EvalFailure(message=f"Behavioral mismatch: {len(diffs)} diffs", diffs=diffs))
 
-        # Run performance tests in the worktree
-        perf_result_file = get_run_tmp_file(Path(f"pytest_perf_candidate_{candidate_index}.xml"))
+        return Success(
+            _BehavioralPass(
+                slot=slot,
+                candidate_index=candidate_index,
+                perf_test_files=perf_test_files,
+                test_env=pytest_test_env,
+                pytest_cmd_list=pytest_cmd_list,
+            )
+        )
+
+    async def _benchmark_phase(
+        self, bp: _BehavioralPass, original_code_baseline: OriginalCodeBaseline
+    ) -> Result[OptimizedCandidateResult, EvalFailure]:
+        """Run performance benchmarks sequentially for a candidate that passed behavioral tests."""
+        opt = self._optimizer
+
+        blocklisted_plugins = ["benchmark", "codspeed", "xdist", "sugar"]
+        blocklist_args = [f"-p no:{plugin}" for plugin in blocklisted_plugins]
+
+        perf_result_file = get_run_tmp_file(Path(f"pytest_perf_candidate_{bp.candidate_index}.xml"))
         perf_result_args = [f"--junitxml={perf_result_file.as_posix()}", "-o", "junit_logging=all"]
 
         perf_pytest_args = [
@@ -250,14 +280,16 @@ async def _run_in_worktree(
             f"--timeout={INDIVIDUAL_TESTCASE_TIMEOUT}",
         ]
 
-        perf_cmd = pytest_cmd_list + perf_pytest_args + blocklist_args + perf_result_args + perf_test_files
+        perf_cmd = bp.pytest_cmd_list + perf_pytest_args + blocklist_args + perf_result_args + bp.perf_test_files
 
         try:
-            await async_execute_test_subprocess(cmd_list=perf_cmd, cwd=worktree_cwd, env=pytest_test_env, timeout=600)
+            await async_execute_test_subprocess(cmd_list=perf_cmd, cwd=bp.slot.path, env=bp.test_env, timeout=600)
         except subprocess.TimeoutExpired:
-            logger.warning(f"Performance test timeout for candidate {candidate_index}")
+            logger.warning(f"Performance test timeout for candidate {bp.candidate_index}")
             return Failure(EvalFailure(message="Performance test timeout"))
 
+        from codeflash.verification.parse_test_output import parse_test_xml
+
         perf_test_results = parse_test_xml(perf_result_file, test_files=opt.test_files, test_config=opt.test_cfg)
 
         if not perf_test_results.test_results:
@@ -275,16 +307,43 @@ async def _run_in_worktree(
             OptimizedCandidateResult(
                 max_loop_count=loop_count,
                 best_test_runtime=total_timing,
-                behavior_test_results=behavior_test_results,
+                behavior_test_results=None,
                 benchmarking_test_results=perf_test_results,
                 replay_benchmarking_test_results=None,
-                optimization_candidate_index=candidate_index,
+                optimization_candidate_index=bp.candidate_index,
                 total_candidate_timing=total_timing,
                 async_throughput=None,
                 concurrency_metrics=None,
             )
         )
 
+    @staticmethod
+    def _replace_and_capture(
+        opt: FunctionOptimizer,
+        code_context: CodeOptimizationContext,
+        candidate: OptimizedCandidate,
+        original_helper_code: dict[Path, str],
+    ) -> tuple[str, dict[Path, str]] | None:  # None when nothing was replaced or replacement raised a handled error
+        """Apply code replacement to main tree, capture the result, restore original."""
+        fto = opt.function_to_optimize
+        try:
+            did_update = opt.replace_function_and_helpers_with_optimized_code(
+                code_context=code_context,
+                optimized_code=candidate.source_code,
+                original_helper_code=original_helper_code,
+            )
+            if not did_update:  # replacement reported no change, so there is nothing to capture
+                return None
+
+            fto_code = Path(fto.file_path).read_text("utf-8")  # snapshot the rewritten target file
+            helper_codes = {Path(p): Path(p).read_text("utf-8") for p in original_helper_code}  # re-read every helper file keyed in original_helper_code
+            return fto_code, helper_codes
+        except (ValueError, SyntaxError, AttributeError) as e:  # only these are handled; anything else propagates after the finally block
+            logger.error(f"Code replacement failed: {e}")
+            return None
+        finally:
+            opt.write_code_and_helpers(opt.function_to_optimize_source_code, original_helper_code, fto.file_path)  # runs on every path: write the original sources back
+
 
 def run_parallel_evaluation(
     optimizer: FunctionOptimizer,