test: parallel evaluation unit and integration tests

KRRT7 · KRRT7 · commit 9677b5604653 · 2026-05-06T19:24:34.000-05:00
- WorktreePool: lifecycle, acquire/release, file isolation
- async_execute_test_subprocess: stdout, stderr, timeout
- ParallelCandidateEvaluator: code replacement failure, behavioral
  mismatch with diffs, successful routing, concurrent evaluation
diff --git a/tests/test_parallel_evaluator.py b/tests/test_parallel_evaluator.py
@@ -0,0 +1,344 @@
+"""Integration tests for the parallel candidate evaluation infrastructure."""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import anyio
+import pytest
+
+from codeflash.either import Failure, Success, is_successful
+from codeflash.languages.function_optimizer import CandidateNode
+from codeflash.optimization.parallel_evaluator import EvalFailure, ParallelCandidateEvaluator
+
+
+class TestWorktreePoolLifecycle:
+    """Creation, teardown and slot hand-out of a ``WorktreePool`` rooted at this repository."""
+
+    def test_creates_n_worktrees_and_cleans_up(self, tmp_path: Path) -> None:
+        """Entering the pool yields ``pool_size`` slot directories with pid markers; exiting empties ``_slots``."""
+        from codeflash.code_utils.worktree_pool import WorktreePool
+
+        pool_size = 3
+        base_dir = tmp_path.resolve() / "worktrees"
+
+        # The pool needs a git root. We use the codeflash repo itself.
+        repo_root = Path(__file__).resolve().parents[1]
+
+        async def _run() -> None:
+            with patch("codeflash.code_utils.worktree_pool.git_root_dir", return_value=repo_root):
+                pool = WorktreePool(pool_size=pool_size, base_dir=base_dir)
+                async with pool:
+                    assert len(pool._slots) == pool_size
+                    for slot in pool._slots:
+                        assert slot.path.exists()
+                        assert slot.path.is_dir()
+                        assert (slot.path / ".codeflash_pool.pid").exists()
+
+                # After cleanup, slots are cleared
+                assert len(pool._slots) == 0
+
+        anyio.run(_run)
+
+    def test_acquire_release_round_trip(self, tmp_path: Path) -> None:
+        """Two acquisitions give distinct slots, and a released slot is the next one handed out."""
+        from codeflash.code_utils.worktree_pool import WorktreePool
+
+        pool_size = 2
+        base_dir = tmp_path.resolve() / "worktrees"
+        repo_root = Path(__file__).resolve().parents[1]
+
+        async def _run() -> None:
+            with patch("codeflash.code_utils.worktree_pool.git_root_dir", return_value=repo_root):
+                async with WorktreePool(pool_size=pool_size, base_dir=base_dir) as pool:
+                    slot1 = await pool.acquire()
+                    slot2 = await pool.acquire()
+
+                    # Both slots should be distinct
+                    assert slot1.index != slot2.index
+                    assert slot1.path != slot2.path
+
+                    # Release one and re-acquire it
+                    await pool.release(slot1)
+                    reacquired = await pool.acquire()
+                    assert reacquired.index == slot1.index
+
+                    await pool.release(slot2)
+                    await pool.release(reacquired)
+
+        anyio.run(_run)
+
+
+class TestWorktreeSlotFileIsolation:
+    def test_write_to_one_slot_does_not_affect_another(self, tmp_path: Path) -> None:
+        """Content written into slot A is absent from slot B's mirror and from the main tree."""
+        from codeflash.code_utils.worktree_pool import WorktreePool
+
+        pool_size = 2
+        base_dir = tmp_path.resolve() / "worktrees"
+        repo_root = Path(__file__).resolve().parents[1]
+        # Real file from the main tree; each slot is addressed through its own mirror of this path.
+        test_file = repo_root / "codeflash" / "__init__.py"
+
+        async def _run() -> None:
+            with patch("codeflash.code_utils.worktree_pool.git_root_dir", return_value=repo_root):
+                async with WorktreePool(pool_size=pool_size, base_dir=base_dir) as pool:
+                    slot_a = await pool.acquire()
+                    slot_b = await pool.acquire()
+
+                    sentinel = "# SLOT_A_SENTINEL_CONTENT\n"
+                    await slot_a.write_candidate(test_file, sentinel)
+
+                    # slot_b's mirror of the same file should NOT contain the sentinel
+                    mirrored_b = slot_b.mirror(test_file)
+                    content_b = mirrored_b.read_text(encoding="utf-8")
+                    assert sentinel not in content_b
+
+                    # slot_a's mirror should contain it
+                    mirrored_a = slot_a.mirror(test_file)
+                    content_a = mirrored_a.read_text(encoding="utf-8")
+                    assert content_a == sentinel
+
+                    # Main tree should be unaffected
+                    main_content = test_file.read_text(encoding="utf-8")
+                    assert sentinel not in main_content
+
+                    await pool.release(slot_a)
+                    await pool.release(slot_b)
+
+        anyio.run(_run)
+
+
+class TestAsyncExecuteTestSubprocess:
+    """Drive ``async_execute_test_subprocess`` with short ``python -c`` child processes."""
+
+    @staticmethod
+    def _run_snippet(snippet: str, timeout: int = 30) -> subprocess.CompletedProcess[str]:
+        """Execute ``python -c <snippet>`` through the async runner and return its result.
+
+        Args:
+            snippet: Python source handed to the interpreter via ``-c``.
+            timeout: Value forwarded as the runner's ``timeout`` argument.
+
+        Returns:
+            Whatever ``async_execute_test_subprocess`` resolves to for the command.
+
+        Raises:
+            Exception: Anything the runner raises propagates unchanged to the calling test.
+        """
+        from codeflash.languages.python.test_runner import async_execute_test_subprocess
+
+        tests_dir = Path(__file__).resolve().parent
+
+        async def _invoke() -> subprocess.CompletedProcess[str]:
+            return await async_execute_test_subprocess(
+                cmd_list=[sys.executable, "-c", snippet], cwd=tests_dir, env=None, timeout=timeout
+            )
+
+        return anyio.run(_invoke)
+
+    def test_runs_simple_command(self) -> None:
+        """A successful child reports exit status 0 and its stdout text."""
+        completed = self._run_snippet("print('hello world')")
+        assert completed.returncode == 0
+        assert "hello world" in completed.stdout
+
+    def test_captures_stderr(self) -> None:
+        """Text the child writes to stderr is available on the result."""
+        completed = self._run_snippet("import sys; sys.stderr.write('err_msg\\n')")
+        assert "err_msg" in completed.stderr
+
+    def test_timeout_raises(self) -> None:
+        """A child outliving the timeout surfaces as ``subprocess.TimeoutExpired``."""
+        with pytest.raises(subprocess.TimeoutExpired):
+            self._run_snippet("import time; time.sleep(60)", timeout=1)
+
+
+class TestParallelCandidateEvaluator:
+    """Unit tests for the evaluator with mocked worktree operations."""
+
+    def _make_candidate_node(self, opt_id: str = "cand_1") -> CandidateNode:
+        """Wrap a one-file ``OptimizedCandidate`` (``def f(): pass`` for ``test.py``) in a ``CandidateNode``.
+
+        *opt_id* is stored as the candidate's ``optimization_id``.
+        """
+        from codeflash.models.models import CodeString, CodeStringsMarkdown, OptimizedCandidate
+        from codeflash.models.shared_types import OptimizedCandidateSource
+
+        source_code = CodeStringsMarkdown(code_strings=[CodeString(code="def f(): pass", file_path=Path("test.py"))])
+        candidate = OptimizedCandidate(
+            source_code=source_code,
+            explanation="test optimization",
+            optimization_id=opt_id,
+            source=OptimizedCandidateSource.OPTIMIZE,
+        )
+        return CandidateNode(candidate)
+
+    def _make_optimizer_mock(self, tmp_path: Path) -> MagicMock:
+        """Build the ``MagicMock`` handed to ``ParallelCandidateEvaluator`` as its first argument.
+
+        Args:
+            tmp_path: Used as ``args.project_root``; the target file path is ``src/module.py`` below it.
+
+        Returns:
+            A mock whose code replacement returns ``True`` unless a test overrides it.
+        """
+        opt = MagicMock()
+        opt.function_to_optimize.file_path = str(tmp_path / "src" / "module.py")
+        opt.function_to_optimize_source_code = "def f(): pass"
+        opt.test_files.test_files = []
+        opt.args.project_root = str(tmp_path)
+        opt.test_cfg = MagicMock()
+        opt.get_test_env.return_value = {"PATH": "/usr/bin"}
+        opt.language_support.build_pytest_cmd.return_value = [sys.executable, "-m", "pytest"]
+        opt.replace_function_and_helpers_with_optimized_code.return_value = True
+        opt.write_code_and_helpers = MagicMock()
+        return opt
+
+    def _write_module_stub(self, tmp_path: Path) -> None:
+        """Create ``src/module.py`` under *tmp_path* holding the source the optimizer mock advertises."""
+        src_dir = tmp_path / "src"
+        src_dir.mkdir(parents=True)
+        (src_dir / "module.py").write_text("def f(): pass", encoding="utf-8")
+
+    def _evaluate(self, evaluator: ParallelCandidateEvaluator, candidates: list) -> list:  # type: ignore[type-arg]
+        """Run ``evaluate_candidates`` to completion under ``anyio`` and return its results.
+
+        ``codeflash.code_utils.worktree_pool.git_root_dir`` is patched to return the directory one level above
+        this test file's folder; every other keyword argument is an empty dict or a fresh ``MagicMock``.
+
+        Args:
+            evaluator: Evaluator under test.
+            candidates: ``(node, index, None)`` tuples forwarded unchanged as ``candidates``.
+
+        Returns:
+            The list produced by ``evaluate_candidates``; the tests unpack each item as a pair whose
+            second element is the per-candidate result.
+        """
+        repo_root = Path(__file__).resolve().parents[1]
+
+        async def _run() -> list:  # type: ignore[type-arg]
+            with patch("codeflash.code_utils.worktree_pool.git_root_dir", return_value=repo_root):
+                return await evaluator.evaluate_candidates(
+                    candidates=candidates,
+                    code_context=MagicMock(),
+                    original_code_baseline=MagicMock(),
+                    original_helper_code={},
+                    file_path_to_helper_classes={},
+                )
+
+        return anyio.run(_run)
+
+    def test_code_replacement_failure_returns_eval_failure(self, tmp_path: Path) -> None:
+        """A candidate whose code replacement returns ``False`` yields an ``EvalFailure`` with no diffs."""
+        opt = self._make_optimizer_mock(tmp_path)
+        opt.replace_function_and_helpers_with_optimized_code.return_value = False
+        evaluator = ParallelCandidateEvaluator(opt, pool_size=1)
+
+        results = self._evaluate(evaluator, [(self._make_candidate_node(), 0, None)])
+
+        assert len(results) == 1
+        _, result = results[0]
+        assert result is not None
+        assert not is_successful(result)
+        failure = result.failure()
+        assert isinstance(failure, EvalFailure)
+        assert "Code replacement failed" in failure.message
+        assert failure.diffs == []
+
+    def test_behavioral_mismatch_carries_diffs(self, tmp_path: Path) -> None:
+        """A failure from the behavioral phase reaches the caller with its ``TestDiff`` list."""
+        from codeflash.models.models import TestDiff, TestDiffScope
+
+        opt = self._make_optimizer_mock(tmp_path)
+        self._write_module_stub(tmp_path)
+        evaluator = ParallelCandidateEvaluator(opt, pool_size=1)
+
+        mock_diffs = [TestDiff(scope=TestDiffScope.DID_PASS, original_pass=True, candidate_pass=False)]
+        mismatch = Failure(EvalFailure(message="Behavioral mismatch: 1 diffs", diffs=mock_diffs))  # type: ignore[arg-type]
+
+        with patch.object(ParallelCandidateEvaluator, "_run_behavioral", return_value=mismatch):
+            results = self._evaluate(evaluator, [(self._make_candidate_node(), 0, None)])
+
+        # Same shape checks as the code-replacement test, followed by the diff payload itself.
+        assert len(results) == 1
+        _, result = results[0]
+        assert result is not None
+        assert not is_successful(result)
+        failure = result.failure()
+        assert isinstance(failure, EvalFailure)
+        assert len(failure.diffs) == 1
+        assert failure.diffs[0].scope == TestDiffScope.DID_PASS
+
+    def test_successful_candidate_returns_result(self, tmp_path: Path) -> None:
+        """With both phases patched to succeed, the benchmark result is what the caller unwraps."""
+        from codeflash.optimization.parallel_evaluator import _BehavioralPass
+
+        opt = self._make_optimizer_mock(tmp_path)
+        self._write_module_stub(tmp_path)
+        evaluator = ParallelCandidateEvaluator(opt, pool_size=1)
+
+        mock_result = MagicMock()
+        mock_result.best_test_runtime = 5000
+
+        async def mock_behavioral(self_eval: object, *args: object, **kwargs: object) -> Success:  # type: ignore[type-arg]
+            slot = MagicMock()
+            return Success(
+                _BehavioralPass(slot=slot, candidate_index=0, perf_test_files=[], test_env={}, pytest_cmd_list=[])
+            )
+
+        with (
+            patch.object(ParallelCandidateEvaluator, "_run_behavioral", mock_behavioral),
+            patch.object(ParallelCandidateEvaluator, "_benchmark_phase", return_value=Success(mock_result)),
+        ):
+            results = self._evaluate(evaluator, [(self._make_candidate_node(), 0, None)])
+
+        assert len(results) == 1
+        _, result = results[0]
+        assert is_successful(result)
+        assert result.unwrap().best_test_runtime == 5000
+
+    def test_multiple_candidates_evaluated_concurrently(self, tmp_path: Path) -> None:
+        """Three candidates on a three-slot pool: each patched phase is called three times and all succeed."""
+        from codeflash.optimization.parallel_evaluator import _BehavioralPass
+
+        opt = self._make_optimizer_mock(tmp_path)
+        self._write_module_stub(tmp_path)
+        nodes = [self._make_candidate_node(f"cand_{i}") for i in range(3)]
+        evaluator = ParallelCandidateEvaluator(opt, pool_size=3)
+
+        mock_result = MagicMock()
+        mock_result.best_test_runtime = 1000
+
+        behavioral_call_count = 0
+        benchmark_call_count = 0
+
+        async def mock_behavioral(self_eval: object, *args: object, **kwargs: object) -> Success:  # type: ignore[type-arg]
+            nonlocal behavioral_call_count
+            behavioral_call_count += 1
+            slot = MagicMock()
+            # NOTE(review): every pass reports candidate_index=0 whichever candidate it belongs to;
+            # confirm the evaluator does not use this field to route results.
+            return Success(
+                _BehavioralPass(slot=slot, candidate_index=0, perf_test_files=[], test_env={}, pytest_cmd_list=[])
+            )
+
+        async def mock_benchmark(self_eval: object, *args: object, **kwargs: object) -> Success:  # type: ignore[type-arg]
+            nonlocal benchmark_call_count
+            benchmark_call_count += 1
+            return Success(mock_result)
+
+        with (
+            patch.object(ParallelCandidateEvaluator, "_run_behavioral", mock_behavioral),
+            patch.object(ParallelCandidateEvaluator, "_benchmark_phase", mock_benchmark),
+        ):
+            results = self._evaluate(evaluator, [(n, i, None) for i, n in enumerate(nodes)])
+
+        assert len(results) == 3
+        assert behavioral_call_count == 3
+        assert benchmark_call_count == 3
+        for _, result in results:
+            assert is_successful(result)