fix: race conditions and re-staging bug in parallel evaluator

KRRT7 · KRRT7 · commit d0586fd6755f · 2026-05-06T20:53:01.000-05:00
- Add replace_lock to serialize main-tree access in _replace_and_capture
- Fix Phase 2 benchmark not writing candidate code to fresh worktree slot
- Add _closed flag and ClosedResourceError suppression in pool release
- Broaden exception handling in _replace_and_capture (catch Exception) and guard the finally restore block so a restore failure is logged instead of raised
- Remove unused eval_ctx/exp_type params from run_parallel_evaluation and the unused file_path_to_helper_classes param from _behavioral_phase
- Add tests for re-staging, partial pool init, restore-on-failure, empty candidates
diff --git a/codeflash/code_utils/worktree_pool.py b/codeflash/code_utils/worktree_pool.py
@@ -46,6 +46,7 @@ def __init__(self, pool_size: int = 4, base_dir: Path | None = None) -> None:
         self._send: anyio.abc.ObjectSendStream[WorktreeSlot] | None = None
         self._receive: anyio.abc.ObjectReceiveStream[WorktreeSlot] | None = None
         self._initialized = False
+        self._closed = False
 
     async def initialize(self) -> None:
         if self._initialized:
@@ -93,10 +94,15 @@ async def acquire(self) -> WorktreeSlot:
         return await self._receive.receive()
 
     async def release(self, slot: WorktreeSlot) -> None:
+        if self._closed:
+            return
         assert self._send is not None
-        await self._send.send(slot)
+        with contextlib.suppress(anyio.ClosedResourceError):
+            await self._send.send(slot)
 
     async def cleanup(self) -> None:
+        self._closed = True
+
         if self._send is not None:
             await self._send.aclose()
         if self._receive is not None:
diff --git a/codeflash/languages/function_optimizer.py b/codeflash/languages/function_optimizer.py
@@ -1541,8 +1541,6 @@ def _evaluate_candidates_parallel(
                 original_code_baseline=original_code_baseline,
                 original_helper_code=original_helper_code,
                 file_path_to_helper_classes=file_path_to_helper_classes,
-                eval_ctx=eval_ctx,
-                exp_type=exp_type,
                 pool_size=pool_size,
             )
 
diff --git a/codeflash/optimization/parallel_evaluator.py b/codeflash/optimization/parallel_evaluator.py
@@ -18,7 +18,6 @@
     from codeflash.either import Result
     from codeflash.languages.function_optimizer import CandidateNode, FunctionOptimizer
     from codeflash.models.models import (
-        CandidateEvaluationContext,
         CodeOptimizationContext,
         OptimizedCandidate,
         OptimizedCandidateResult,
@@ -45,6 +44,9 @@ class _BehavioralPass:
     test_env: dict[str, str]
     pytest_cmd_list: list[str]
     behavior_test_results: TestResults
+    fto_code: str
+    helper_codes: dict[Path, str]
+    fto_file_path: Path
 
 
 class ParallelCandidateEvaluator:
@@ -59,6 +61,7 @@ def __init__(self, optimizer: FunctionOptimizer, pool_size: int = 4) -> None:
         self._optimizer = optimizer
         self._pool_size = pool_size
         self._pool: WorktreePool | None = None
+        self._replace_lock = anyio.Lock()
 
     async def evaluate_candidates(
         self,
@@ -92,7 +95,6 @@ async def evaluate_candidates(
                         code_context,
                         original_code_baseline,
                         original_helper_code,
-                        file_path_to_helper_classes,
                         results,
                         behavioral_passes,
                     )
@@ -119,7 +121,6 @@ async def _behavioral_phase(
         code_context: CodeOptimizationContext,
         original_code_baseline: OriginalCodeBaseline,
         original_helper_code: dict[Path, str],
-        file_path_to_helper_classes: dict[Path, set[str]],
         results: list[tuple[CandidateNode, Result[OptimizedCandidateResult, EvalFailure] | None]],
         behavioral_passes: list[tuple[int, CandidateNode, _BehavioralPass]],
     ) -> None:
@@ -135,16 +136,10 @@ async def _behavioral_phase(
                 original_code_baseline=original_code_baseline,
                 original_helper_code=original_helper_code,
             )
-        except BaseException as exc:
-            if not isinstance(exc, Exception):
-                await self._pool.release(slot)
-                raise
-            logger.error(f"Candidate {candidate_node.candidate.optimization_id} raised: {exc}")
-            results[result_index] = (candidate_node, Failure(EvalFailure(message=str(exc))))
+        except BaseException:
             await self._pool.release(slot)
-            return
+            raise
 
-        # Always release slot — Phase 2 re-acquires for benchmarking
         await self._pool.release(slot)
 
         if isinstance(outcome, Failure):
@@ -166,9 +161,11 @@ async def _run_behavioral(
         opt = self._optimizer
         fto = opt.function_to_optimize
 
-        candidate_files = await anyio.to_thread.run_sync(
-            self._replace_and_capture, opt, code_context, candidate, original_helper_code
-        )
+        # Serialize main-tree access: replace_and_capture writes/reads/restores shared files
+        async with self._replace_lock:
+            candidate_files = await anyio.to_thread.run_sync(
+                self._replace_and_capture, opt, code_context, candidate, original_helper_code
+            )
 
         if candidate_files is None:
             return Failure(EvalFailure(message="Code replacement failed"))
@@ -259,6 +256,9 @@ async def _run_behavioral(
                 test_env=pytest_test_env,
                 pytest_cmd_list=pytest_cmd_list,
                 behavior_test_results=behavior_test_results,
+                fto_code=fto_code,
+                helper_codes=helper_codes,
+                fto_file_path=Path(fto.file_path),
             )
         )
 
@@ -269,7 +269,10 @@ async def _benchmark_phase(
         opt = self._optimizer
 
         # Re-stage the candidate code in the acquired slot
-        fto = opt.function_to_optimize
+        await slot.write_candidate(bp.fto_file_path, bp.fto_code)
+        for module_path, code in bp.helper_codes.items():
+            await slot.write_candidate(module_path, code)
+
         for file in opt.test_files.test_files:
             if file.benchmarking_file_path and file.benchmarking_file_path.exists():
                 await slot.write_candidate(
@@ -352,11 +355,14 @@ def _replace_and_capture(
             fto_code = Path(fto.file_path).read_text("utf-8")
             helper_codes = {Path(p): Path(p).read_text("utf-8") for p in original_helper_code}
             return fto_code, helper_codes
-        except (ValueError, SyntaxError, AttributeError) as e:
+        except Exception as e:
             logger.error(f"Code replacement failed: {e}")
             return None
         finally:
-            opt.write_code_and_helpers(opt.function_to_optimize_source_code, original_helper_code, fto.file_path)
+            try:
+                opt.write_code_and_helpers(opt.function_to_optimize_source_code, original_helper_code, fto.file_path)
+            except Exception as restore_err:
+                logger.error(f"Failed to restore main tree after code replacement: {restore_err}")
 
 
 def run_parallel_evaluation(
@@ -366,8 +372,6 @@ def run_parallel_evaluation(
     original_code_baseline: OriginalCodeBaseline,
     original_helper_code: dict[Path, str],
     file_path_to_helper_classes: dict[Path, set[str]],
-    eval_ctx: CandidateEvaluationContext,
-    exp_type: str,
     pool_size: int = 4,
 ) -> tuple[list[tuple[CandidateNode, Result[OptimizedCandidateResult, EvalFailure] | None]], list, list]:
     """Entry point: run parallel candidate evaluation from sync code via anyio.
diff --git a/tests/test_parallel_evaluator.py b/tests/test_parallel_evaluator.py
@@ -5,6 +5,7 @@
 import subprocess
 import sys
 from pathlib import Path
+from typing import Any
 from unittest.mock import MagicMock, patch
 
 import anyio
@@ -41,6 +42,40 @@ async def _run() -> None:
 
         anyio.run(_run)
 
+    def test_partial_pool_initialization(self, tmp_path: Path) -> None:
+        """Pool operates at reduced capacity if some slots fail to create."""
+        from unittest.mock import patch
+
+        from codeflash.code_utils.worktree_pool import WorktreePool
+
+        pool_size = 3
+        base_dir = tmp_path.resolve() / "worktrees"
+        repo_root = Path(__file__).resolve().parents[1]
+
+        call_count = 0
+
+        original_create_slot = WorktreePool._create_slot
+
+        async def failing_create_slot(self: Any, index: int) -> Any:
+            nonlocal call_count
+            call_count += 1
+            if index == 1:
+                raise RuntimeError("Simulated git worktree failure")
+            return await original_create_slot(self, index)
+
+        async def _run() -> None:
+            with (
+                patch("codeflash.code_utils.worktree_pool.git_root_dir", return_value=repo_root),
+                patch.object(WorktreePool, "_create_slot", failing_create_slot),
+            ):
+                async with WorktreePool(pool_size=pool_size, base_dir=base_dir) as pool:
+                    assert len(pool._slots) == 2
+                    slot = await pool.acquire()
+                    assert slot.index != 1
+                    await pool.release(slot)
+
+        anyio.run(_run)
+
     def test_acquire_release_round_trip(self, tmp_path: Path) -> None:
         from unittest.mock import patch
 
@@ -275,6 +310,9 @@ async def mock_behavioral(self_eval: object, *args: object, **kwargs: object) ->
                     test_env={},
                     pytest_cmd_list=[],
                     behavior_test_results=mock_behavior_results,
+                    fto_code="def f(): pass",
+                    helper_codes={},
+                    fto_file_path=Path("/tmp/module.py"),
                 )
             )
 
@@ -324,6 +362,9 @@ async def mock_behavioral(self_eval: object, *args: object, **kwargs: object) ->
                     test_env={},
                     pytest_cmd_list=[],
                     behavior_test_results=mock_behavior_results,
+                    fto_code="def f(): pass",
+                    helper_codes={},
+                    fto_file_path=Path("/tmp/module.py"),
                 )
             )
 
@@ -355,6 +396,108 @@ async def _run() -> list:  # type: ignore[type-arg]
         for _, result in results:
             assert is_successful(result)
 
+    def test_benchmark_phase_restages_candidate_code(self, tmp_path: Path) -> None:
+        """Phase 2 must write fto_code and helper_codes to the slot before running benchmarks."""
+        from codeflash.optimization.parallel_evaluator import _BehavioralPass
+
+        opt = self._make_optimizer_mock(tmp_path)
+        (tmp_path / "src").mkdir(parents=True)
+        (tmp_path / "src" / "module.py").write_text("def f(): pass", encoding="utf-8")
+
+        node = self._make_candidate_node()
+        evaluator = ParallelCandidateEvaluator(opt, pool_size=1)
+
+        repo_root = Path(__file__).resolve().parents[1]
+        fto_code = "def f(): return 42  # optimized"
+        helper_path = tmp_path / "src" / "helpers.py"
+        helper_codes = {helper_path: "HELPER_CODE = True"}
+
+        write_calls: list[tuple[Path, str]] = []
+
+        async def tracking_write_candidate(self_slot: object, file_path: Path, code: str) -> None:
+            write_calls.append((file_path, code))
+
+        async def mock_behavioral(self_eval: object, *args: object, **kwargs: object) -> Success:  # type: ignore[type-arg]
+            return Success(
+                _BehavioralPass(
+                    candidate_index=0,
+                    perf_test_files=[],
+                    test_env={"PATH": "/usr/bin"},
+                    pytest_cmd_list=[sys.executable, "-m", "pytest"],
+                    behavior_test_results=MagicMock(),
+                    fto_code=fto_code,
+                    helper_codes=helper_codes,
+                    fto_file_path=Path(opt.function_to_optimize.file_path),
+                )
+            )
+
+        async def _run() -> list:  # type: ignore[type-arg]
+            with (
+                patch("codeflash.code_utils.worktree_pool.git_root_dir", return_value=repo_root),
+                patch.object(ParallelCandidateEvaluator, "_run_behavioral", mock_behavioral),
+                patch(
+                    "codeflash.code_utils.worktree_pool.WorktreeSlot.write_candidate", tracking_write_candidate
+                ),
+                patch(
+                    "codeflash.languages.python.test_runner.async_execute_test_subprocess",
+                    return_value=MagicMock(returncode=0, stdout="", stderr=""),
+                ),
+                patch(
+                    "codeflash.verification.parse_test_output.parse_test_xml",
+                    return_value=MagicMock(test_results=[MagicMock()], effective_loop_count=lambda: 10, total_passed_runtime=lambda: 5000),
+                ),
+            ):
+                return await evaluator.evaluate_candidates(
+                    candidates=[(node, 0, None)],
+                    code_context=MagicMock(),
+                    original_code_baseline=MagicMock(),
+                    original_helper_code={},
+                    file_path_to_helper_classes={},
+                )
+
+        anyio.run(_run)
+
+        written_codes = {p: c for p, c in write_calls}
+        assert Path(opt.function_to_optimize.file_path) in written_codes
+        assert written_codes[Path(opt.function_to_optimize.file_path)] == fto_code
+        assert helper_path in written_codes
+        assert written_codes[helper_path] == "HELPER_CODE = True"
+
+    def test_empty_candidates_returns_empty(self, tmp_path: Path) -> None:
+        opt = self._make_optimizer_mock(tmp_path)
+        evaluator = ParallelCandidateEvaluator(opt, pool_size=1)
+        repo_root = Path(__file__).resolve().parents[1]
+
+        async def _run() -> list:  # type: ignore[type-arg]
+            with patch("codeflash.code_utils.worktree_pool.git_root_dir", return_value=repo_root):
+                return await evaluator.evaluate_candidates(
+                    candidates=[],
+                    code_context=MagicMock(),
+                    original_code_baseline=MagicMock(),
+                    original_helper_code={},
+                    file_path_to_helper_classes={},
+                )
+
+        results = anyio.run(_run)
+        assert results == []
+
+    def test_replace_and_capture_restores_on_failure(self, tmp_path: Path) -> None:
+        """_replace_and_capture must restore original code even when replacement raises."""
+        opt = self._make_optimizer_mock(tmp_path)
+        (tmp_path / "src").mkdir(parents=True)
+        original_code = "def f(): pass"
+        (tmp_path / "src" / "module.py").write_text(original_code, encoding="utf-8")
+
+        opt.replace_function_and_helpers_with_optimized_code.side_effect = ValueError("bad code")
+
+        result = ParallelCandidateEvaluator._replace_and_capture(
+            opt, MagicMock(), MagicMock(), {}
+        )
+        assert result is None
+        opt.write_code_and_helpers.assert_called_once_with(
+            opt.function_to_optimize_source_code, {}, opt.function_to_optimize.file_path
+        )
+
     def test_more_candidates_than_slots_no_deadlock(self, tmp_path: Path) -> None:
         """Regression test: more passing candidates than pool slots must not deadlock."""
         from codeflash.optimization.parallel_evaluator import _BehavioralPass
@@ -379,6 +522,9 @@ async def mock_behavioral(self_eval: object, *args: object, **kwargs: object) ->
                     test_env={},
                     pytest_cmd_list=[],
                     behavior_test_results=mock_behavior_results,
+                    fto_code="def f(): pass",
+                    helper_codes={},
+                    fto_file_path=Path("/tmp/module.py"),
                 )
             )
 

Original file line number	Diff line number	Diff line change
`@@ -1541,8 +1541,6 @@ def _evaluate_candidates_parallel(`
`1541`	`1541`	`original_code_baseline=original_code_baseline,`
`1542`	`1542`	`original_helper_code=original_helper_code,`
`1543`	`1543`	`file_path_to_helper_classes=file_path_to_helper_classes,`
`1544`		`- eval_ctx=eval_ctx,`
`1545`		`- exp_type=exp_type,`
`1546`	`1544`	`pool_size=pool_size,`
`1547`	`1545`	`)`
`1548`	`1546`