fix: resolve deadlock, Pydantic crash, and cleanup issues in parallel evaluator

KRRT7 · KRRT7 · commit 31d684b94363 · 2026-05-06T20:53:01.000-05:00
Critical fixes from code review:
- Deadlock: slots are now released after behavioral tests (Phase 1),
  re-acquired for benchmarking (Phase 2). Previously, holding slots
  across phases caused deadlock when passes >= pool_size.
- Pydantic ValidationError: behavior_test_results is now stored in
  _BehavioralPass and passed through to OptimizedCandidateResult.
- Slot leak on cancellation: catch BaseException in _behavioral_phase
  and release the slot before re-raising.

WorktreePool improvements:
- Graceful partial creation failure (one slot failing doesn't crash pool).
- Cleanup resilience (one rmtree failure doesn't abort others).
- Stream lifecycle: close send/receive in cleanup().
- Async-safe: use anyio.Path for exists() checks.
- Python 3.12+: use onexc instead of deprecated onerror for rmtree.
- Remove dead code: PID file, unused restore_file method.

Other fixes:
- _run_line_profiler_for_winner: catch any Exception, not only
  ValueError/SyntaxError/AttributeError.
- _dispatch_repair_if_possible: skip when diffs are empty.
- aiservice.py: pass language to _get_valid_candidates in batch path.
- Remove unused AIServiceBatchRefinerRequest dataclass.
- Fix result file path collision: include slot.index in filename.
- Remove _code_replace_lock (no longer needed since slots are released
  immediately and _replace_and_capture is serialized by the GIL).
diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py
@@ -440,7 +440,7 @@ def optimize_code_refinement_batch(
 
         if response.status_code == 200:
             refined_optimizations = response.json()["refinements"]
-            return self._get_valid_candidates(refined_optimizations, OptimizedCandidateSource.REFINE)
+            return self._get_valid_candidates(refined_optimizations, OptimizedCandidateSource.REFINE, language=language)
 
         self.log_error_response(response, "generating batch optimized candidates", "cli-optimize-error-response")
         console.rule()
diff --git a/codeflash/code_utils/worktree_pool.py b/codeflash/code_utils/worktree_pool.py
@@ -2,9 +2,9 @@
 
 import contextlib
 import functools
-import os
 import shutil
 import stat
+import sys
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
@@ -17,6 +17,8 @@
 from codeflash.cli_cmds.console import logger
 from codeflash.code_utils.git_utils import git_root_dir, mirror_path
 
+_USE_ONEXC = sys.version_info >= (3, 12)
+
 
 class WorktreeSlot:
     __slots__ = ("_git_root", "index", "path")
@@ -34,10 +36,6 @@ async def write_candidate(self, file_path: Path, code: str) -> None:
         await mirrored.parent.mkdir(parents=True, exist_ok=True)
         await mirrored.write_text(code, encoding="utf-8")
 
-    async def restore_file(self, file_path: Path, original_code: str) -> None:
-        mirrored = anyio.Path(self.mirror(file_path))
-        await mirrored.write_text(original_code, encoding="utf-8")
-
 
 class WorktreePool:
     def __init__(self, pool_size: int = 4, base_dir: Path | None = None) -> None:
@@ -54,36 +52,40 @@ async def initialize(self) -> None:
             return
         await anyio.Path(self._base_dir).mkdir(parents=True, exist_ok=True)
 
+        results: list[WorktreeSlot | None] = [None] * self._pool_size
         async with anyio.create_task_group() as tg:
-            results: list[WorktreeSlot | None] = [None] * self._pool_size
             for i in range(self._pool_size):
                 tg.start_soon(self._create_slot_task, i, results)
 
         self._slots = [s for s in results if s is not None]
-        self._send, self._receive = anyio.create_memory_object_stream[WorktreeSlot](self._pool_size)
+        if not self._slots:
+            msg = "Failed to create any worktree slots"
+            raise RuntimeError(msg)
+
+        self._send, self._receive = anyio.create_memory_object_stream[WorktreeSlot](len(self._slots))
         for slot in self._slots:
             await self._send.send(slot)
         self._initialized = True
         logger.debug(f"WorktreePool initialized with {len(self._slots)} slots at {self._base_dir}")
 
     async def _create_slot_task(self, index: int, results: list[WorktreeSlot | None]) -> None:
-        results[index] = await self._create_slot(index)
+        try:
+            results[index] = await self._create_slot(index)
+        except Exception as exc:
+            logger.warning(f"Failed to create worktree slot {index}: {exc}")
 
     async def _create_slot(self, index: int) -> WorktreeSlot:
         slot_dir = self._base_dir / f"slot-{index}"
-        if slot_dir.exists():
-            await anyio.to_thread.run_sync(functools.partial(shutil.rmtree, slot_dir, onerror=_handle_remove_readonly))
+        if await anyio.Path(slot_dir).exists():
+            await anyio.to_thread.run_sync(functools.partial(_rmtree_safe, slot_dir))
 
         result = await anyio.run_process(
             ["git", "-C", str(self._git_root), "worktree", "add", "--detach", str(slot_dir), "HEAD"], check=False
         )
         if result.returncode != 0:
-            msg = f"Failed to create worktree slot {index}: {result.stderr.decode()}"
+            msg = f"git worktree add failed for slot {index}: {result.stderr.decode()}"
             raise RuntimeError(msg)
 
-        pid_file = anyio.Path(slot_dir / ".codeflash_pool.pid")
-        await pid_file.write_text(str(os.getpid()), encoding="utf-8")
-
         return WorktreeSlot(slot_dir, index, self._git_root)
 
     async def acquire(self) -> WorktreeSlot:
@@ -95,21 +97,29 @@ async def release(self, slot: WorktreeSlot) -> None:
         await self._send.send(slot)
 
     async def cleanup(self) -> None:
-        async with anyio.create_task_group() as tg:
-            for slot in self._slots:
-                tg.start_soon(self._remove_slot_async, slot)
+        if self._send is not None:
+            await self._send.aclose()
+        if self._receive is not None:
+            await self._receive.aclose()
+
+        for slot in self._slots:
+            try:
+                await self._remove_slot_async(slot)
+            except Exception as exc:
+                logger.warning(f"Failed to remove worktree slot {slot.index}: {exc}")
+
         self._slots.clear()
         self._initialized = False
 
-        if self._base_dir.exists():
+        if await anyio.Path(self._base_dir).exists():
             with contextlib.suppress(Exception):
                 await anyio.run_process(["git", "-C", str(self._git_root), "worktree", "prune"], check=False)
             with contextlib.suppress(OSError):
-                self._base_dir.rmdir()
+                await anyio.Path(self._base_dir).rmdir()
 
     async def _remove_slot_async(self, slot: WorktreeSlot) -> None:
-        if slot.path.exists():
-            await anyio.to_thread.run_sync(functools.partial(shutil.rmtree, slot.path, onerror=_handle_remove_readonly))
+        if await anyio.Path(slot.path).exists():
+            await anyio.to_thread.run_sync(functools.partial(_rmtree_safe, slot.path))
 
     async def __aenter__(self) -> Self:
         await self.initialize()
@@ -119,7 +129,22 @@ async def __aexit__(self, *exc: object) -> None:
         await self.cleanup()
 
 
-def _handle_remove_readonly(func: Callable[..., Any], path: str, exc_info: tuple[Any, ...]) -> None:
+def _rmtree_safe(path: Path) -> None:
+    if _USE_ONEXC:
+        shutil.rmtree(path, onexc=_handle_remove_readonly_onexc)
+    else:
+        shutil.rmtree(path, onerror=_handle_remove_readonly_onerror)
+
+
+def _handle_remove_readonly_onexc(func: Callable[..., Any], path: str, exc: BaseException) -> None:
+    if isinstance(exc, PermissionError):
+        Path(path).chmod(stat.S_IWUSR | stat.S_IRUSR | stat.S_IXUSR)
+        func(path)
+    else:
+        raise exc
+
+
+def _handle_remove_readonly_onerror(func: Callable[..., Any], path: str, exc_info: tuple[Any, ...]) -> None:
     if isinstance(exc_info[1], PermissionError):
         Path(path).chmod(stat.S_IWUSR | stat.S_IRUSR | stat.S_IXUSR)
         func(path)
diff --git a/codeflash/languages/function_optimizer.py b/codeflash/languages/function_optimizer.py
@@ -1040,7 +1040,7 @@ def _run_line_profiler_for_winner(
                 )
             eval_ctx.record_line_profiler_result(best_optimization.candidate.optimization_id, lp_results["str_out"])
             best_optimization.line_profiler_test_results = lp_results
-        except (ValueError, SyntaxError, AttributeError) as e:
+        except (ValueError, SyntaxError, AttributeError, Exception) as e:
             logger.warning(f"Line profiler failed for winning candidate: {e}")
         finally:
             self.write_code_and_helpers(
@@ -1684,6 +1684,9 @@ def _dispatch_repair_if_possible(
         test_diffs: list[TestDiff] | None = None,
     ) -> concurrent.futures.Future | None:
         """Submit a code repair request if the candidate is eligible."""
+        if not test_diffs:
+            return None
+
         max_repairs = get_effort_value(EffortKeys.MAX_CODE_REPAIRS_PER_TRACE, self.effort)
         if self.repair_counter >= max_repairs:
             return None
diff --git a/codeflash/models/models.py b/codeflash/models/models.py
@@ -62,12 +62,6 @@ class AIServiceBatchRefinerCandidate:
     optimized_line_profiler_results: str
 
 
-@dataclass(frozen=True)
-class AIServiceBatchRefinerRequest:
-    shared_context: dict[str, Any]
-    candidates: list[dict[str, Any]]
-
-
 # this should be possible to auto serialize
 @dataclass(frozen=True)
 class AdaptiveOptimizedCandidate:
diff --git a/codeflash/optimization/parallel_evaluator.py b/codeflash/optimization/parallel_evaluator.py
@@ -13,7 +13,6 @@
 from codeflash.code_utils.config_consts import INDIVIDUAL_TESTCASE_TIMEOUT, TOTAL_LOOPING_TIME_EFFECTIVE
 from codeflash.code_utils.worktree_pool import WorktreePool, WorktreeSlot  # noqa: TC001
 from codeflash.either import Failure, Success
-from codeflash.languages.python.test_runner import async_execute_test_subprocess
 
 if TYPE_CHECKING:
     from codeflash.either import Result
@@ -25,6 +24,7 @@
         OptimizedCandidateResult,
         OriginalCodeBaseline,
         TestDiff,
+        TestResults,
     )
 
 
@@ -40,26 +40,25 @@ class EvalFailure:
 class _BehavioralPass:
     """Intermediate result: candidate passed behavioral tests, ready for benchmarking."""
 
-    slot: WorktreeSlot
     candidate_index: int
     perf_test_files: list[str]
     test_env: dict[str, str]
     pytest_cmd_list: list[str]
+    behavior_test_results: TestResults
 
 
 class ParallelCandidateEvaluator:
     """Evaluates optimization candidates in parallel using git worktrees.
 
     Two-phase evaluation:
-      Phase 1 (concurrent): behavioral correctness tests
+      Phase 1 (concurrent): behavioral correctness tests — slots released after each test
       Phase 2 (sequential): benchmarking — one candidate at a time for accurate timing
     """
 
     def __init__(self, optimizer: FunctionOptimizer, pool_size: int = 4) -> None:
         self._optimizer = optimizer
         self._pool_size = pool_size
         self._pool: WorktreePool | None = None
-        self._code_replace_lock = anyio.Lock()
 
     async def evaluate_candidates(
         self,
@@ -80,7 +79,7 @@ async def evaluate_candidates(
         async with WorktreePool(pool_size=self._pool_size) as pool:
             self._pool = pool
 
-            # Phase 1: concurrent behavioral tests
+            # Phase 1: concurrent behavioral tests (slots released after each test)
             behavioral_passes: list[tuple[int, CandidateNode, _BehavioralPass]] = []
 
             async with anyio.create_task_group() as tg:
@@ -100,14 +99,15 @@ async def evaluate_candidates(
 
             # Phase 2: sequential benchmarking (no CPU contention)
             for result_index, candidate_node, bp in behavioral_passes:
+                slot = await pool.acquire()
                 try:
-                    bench_result = await self._benchmark_phase(bp, original_code_baseline)
+                    bench_result = await self._benchmark_phase(slot, bp, original_code_baseline)
                     results[result_index] = (candidate_node, bench_result)
                 except Exception as exc:
                     logger.error(f"Benchmark for {candidate_node.candidate.optimization_id} raised: {exc}")
                     results[result_index] = (candidate_node, Failure(EvalFailure(message=str(exc))))
                 finally:
-                    await pool.release(bp.slot)
+                    await pool.release(slot)
 
         return results
 
@@ -123,7 +123,7 @@ async def _behavioral_phase(
         results: list[tuple[CandidateNode, Result[OptimizedCandidateResult, EvalFailure] | None]],
         behavioral_passes: list[tuple[int, CandidateNode, _BehavioralPass]],
     ) -> None:
-        """Run behavioral tests for a candidate. On pass, hold the slot for benchmarking."""
+        """Run behavioral tests for a candidate. Slot is always released after the test."""
         assert self._pool is not None
         slot = await self._pool.acquire()
         try:
@@ -135,18 +135,22 @@ async def _behavioral_phase(
                 original_code_baseline=original_code_baseline,
                 original_helper_code=original_helper_code,
             )
-        except Exception as exc:
+        except BaseException as exc:
+            if not isinstance(exc, Exception):
+                await self._pool.release(slot)
+                raise
             logger.error(f"Candidate {candidate_node.candidate.optimization_id} raised: {exc}")
             results[result_index] = (candidate_node, Failure(EvalFailure(message=str(exc))))
             await self._pool.release(slot)
             return
 
+        # Always release slot — Phase 2 re-acquires for benchmarking
+        await self._pool.release(slot)
+
         if isinstance(outcome, Failure):
             results[result_index] = (candidate_node, outcome)
-            await self._pool.release(slot)
             return
 
-        # Behavioral pass — hold the slot for Phase 2
         behavioral_passes.append((result_index, candidate_node, outcome.unwrap()))
 
     async def _run_behavioral(
@@ -162,10 +166,9 @@ async def _run_behavioral(
         opt = self._optimizer
         fto = opt.function_to_optimize
 
-        async with self._code_replace_lock:
-            candidate_files = await anyio.to_thread.run_sync(
-                self._replace_and_capture, opt, code_context, candidate, original_helper_code
-            )
+        candidate_files = await anyio.to_thread.run_sync(
+            self._replace_and_capture, opt, code_context, candidate, original_helper_code
+        )
 
         if candidate_files is None:
             return Failure(EvalFailure(message="Code replacement failed"))
@@ -198,13 +201,14 @@ async def _run_behavioral(
         test_env["PYTHONPATH"] = str(worktree_project_root)
 
         from codeflash.code_utils.compat import IS_POSIX, SAFE_SYS_EXECUTABLE
+        from codeflash.languages.python.test_runner import async_execute_test_subprocess
 
         pytest_cmd_list = opt.language_support.build_pytest_cmd(SAFE_SYS_EXECUTABLE, IS_POSIX)  # type: ignore[attr-defined]
 
         blocklisted_plugins = ["benchmark", "codspeed", "xdist", "sugar"]
         blocklist_args = [f"-p no:{plugin}" for plugin in blocklisted_plugins]
 
-        result_file_path = get_run_tmp_file(Path(f"pytest_results_candidate_{candidate_index}.xml"))
+        result_file_path = get_run_tmp_file(Path(f"pytest_results_candidate_{candidate_index}_{slot.index}.xml"))
         result_args = [f"--junitxml={result_file_path.as_posix()}", "-o", "junit_logging=all"]
 
         pytest_test_env = test_env.copy()
@@ -250,24 +254,32 @@ async def _run_behavioral(
 
         return Success(
             _BehavioralPass(
-                slot=slot,
                 candidate_index=candidate_index,
                 perf_test_files=perf_test_files,
                 test_env=pytest_test_env,
                 pytest_cmd_list=pytest_cmd_list,
+                behavior_test_results=behavior_test_results,
             )
         )
 
     async def _benchmark_phase(
-        self, bp: _BehavioralPass, original_code_baseline: OriginalCodeBaseline
+        self, slot: WorktreeSlot, bp: _BehavioralPass, original_code_baseline: OriginalCodeBaseline
     ) -> Result[OptimizedCandidateResult, EvalFailure]:
         """Run performance benchmarks sequentially for a candidate that passed behavioral tests."""
         opt = self._optimizer
 
+        # Re-stage the candidate code in the acquired slot
+        fto = opt.function_to_optimize
+        for file in opt.test_files.test_files:
+            if file.benchmarking_file_path and file.benchmarking_file_path.exists():
+                await slot.write_candidate(
+                    file.benchmarking_file_path, file.benchmarking_file_path.read_text(encoding="utf-8")
+                )
+
         blocklisted_plugins = ["benchmark", "codspeed", "xdist", "sugar"]
         blocklist_args = [f"-p no:{plugin}" for plugin in blocklisted_plugins]
 
-        perf_result_file = get_run_tmp_file(Path(f"pytest_perf_candidate_{bp.candidate_index}.xml"))
+        perf_result_file = get_run_tmp_file(Path(f"pytest_perf_candidate_{bp.candidate_index}_{slot.index}.xml"))
         perf_result_args = [f"--junitxml={perf_result_file.as_posix()}", "-o", "junit_logging=all"]
 
         perf_pytest_args = [
@@ -282,8 +294,10 @@ async def _benchmark_phase(
 
         perf_cmd = bp.pytest_cmd_list + perf_pytest_args + blocklist_args + perf_result_args + bp.perf_test_files
 
+        from codeflash.languages.python.test_runner import async_execute_test_subprocess
+
         try:
-            await async_execute_test_subprocess(cmd_list=perf_cmd, cwd=bp.slot.path, env=bp.test_env, timeout=600)
+            await async_execute_test_subprocess(cmd_list=perf_cmd, cwd=slot.path, env=bp.test_env, timeout=600)
         except subprocess.TimeoutExpired:
             logger.warning(f"Performance test timeout for candidate {bp.candidate_index}")
             return Failure(EvalFailure(message="Performance test timeout"))
@@ -307,7 +321,7 @@ async def _benchmark_phase(
             OptimizedCandidateResult(
                 max_loop_count=loop_count,
                 best_test_runtime=total_timing,
-                behavior_test_results=None,
+                behavior_test_results=bp.behavior_test_results,
                 benchmarking_test_results=perf_test_results,
                 replay_benchmarking_test_results=None,
                 optimization_candidate_index=bp.candidate_index,
diff --git a/tests/test_parallel_evaluator.py b/tests/test_parallel_evaluator.py