test: add unit tests for ParallelCandidateEvaluator with mocked worktrees

KRRT7 · KRRT7 · commit 62ca5b6035dd · 2026-05-06T18:52:26.000-05:00
Tests cover: code replacement failure → EvalFailure, behavioral mismatch
carrying diffs, successful candidate routing, and concurrent multi-
candidate evaluation.
diff --git a/tests/test_parallel_evaluator.py b/tests/test_parallel_evaluator.py
@@ -5,10 +5,15 @@
 import subprocess
 import sys
 from pathlib import Path
+from unittest.mock import MagicMock, patch
 
 import anyio
 import pytest
 
+from codeflash.either import Failure, Success, is_successful
+from codeflash.languages.function_optimizer import CandidateNode
+from codeflash.optimization.parallel_evaluator import EvalFailure, ParallelCandidateEvaluator
+
 
 class TestWorktreePoolLifecycle:
     def test_creates_n_worktrees_and_cleans_up(self, tmp_path: Path) -> None:
@@ -115,10 +120,7 @@ def test_runs_simple_command(self) -> None:
 
         async def _run() -> subprocess.CompletedProcess[str]:
             return await async_execute_test_subprocess(
-                cmd_list=[sys.executable, "-c", "print('hello world')"],
-                cwd=cwd,
-                env=None,
-                timeout=30,
+                cmd_list=[sys.executable, "-c", "print('hello world')"], cwd=cwd, env=None, timeout=30
             )
 
         result = anyio.run(_run)
@@ -148,11 +150,157 @@ def test_timeout_raises(self) -> None:
 
         async def _run() -> subprocess.CompletedProcess[str]:
             return await async_execute_test_subprocess(
-                cmd_list=[sys.executable, "-c", "import time; time.sleep(60)"],
-                cwd=cwd,
-                env=None,
-                timeout=1,
+                cmd_list=[sys.executable, "-c", "import time; time.sleep(60)"], cwd=cwd, env=None, timeout=1
             )
 
         with pytest.raises(subprocess.TimeoutExpired):
             anyio.run(_run)
+
+
+class TestParallelCandidateEvaluator:
+    """Unit tests for ParallelCandidateEvaluator.evaluate_candidates.
+
+    Every test drives the evaluator with a MagicMock optimizer and a patched
+    ``git_root_dir``; all but the code-replacement test also patch
+    ``_run_in_worktree`` so that it yields a canned Success or Failure.
+    """
+
+    def _make_candidate_node(self, opt_id: str = "cand_1") -> CandidateNode:
+        """Wrap a single-file OptimizedCandidate with id *opt_id* in a CandidateNode."""
+        from codeflash.models.models import CodeString, CodeStringsMarkdown, OptimizedCandidate
+        from codeflash.models.shared_types import OptimizedCandidateSource
+
+        source_code = CodeStringsMarkdown(code_strings=[CodeString(code="def f(): pass", file_path=Path("test.py"))])
+        candidate = OptimizedCandidate(
+            source_code=source_code,
+            explanation="test optimization",
+            optimization_id=opt_id,
+            source=OptimizedCandidateSource.OPTIMIZE,
+        )
+        return CandidateNode(candidate)
+
+    def _make_optimizer_mock(self, tmp_path: Path) -> MagicMock:
+        """Build a MagicMock optimizer whose file and project paths live under *tmp_path*.
+
+        Code replacement is configured to succeed; tests override that where needed.
+        """
+        opt = MagicMock()
+        opt.function_to_optimize.file_path = str(tmp_path / "src" / "module.py")
+        opt.function_to_optimize_source_code = "def f(): pass"
+        opt.test_files.test_files = []
+        opt.args.project_root = str(tmp_path)
+        opt.test_cfg = MagicMock()
+        opt.get_test_env.return_value = {"PATH": "/usr/bin"}
+        opt.language_support.build_pytest_cmd.return_value = [sys.executable, "-m", "pytest"]
+        opt.replace_function_and_helpers_with_optimized_code.return_value = True
+        opt.write_code_and_helpers = MagicMock()
+        return opt
+
+    def _write_source_module(self, tmp_path: Path) -> None:
+        """Create ``src/module.py`` under *tmp_path*, the path the optimizer mock points at."""
+        (tmp_path / "src").mkdir(parents=True)
+        (tmp_path / "src" / "module.py").write_text("def f(): pass", encoding="utf-8")
+
+    def _evaluate(self, evaluator: ParallelCandidateEvaluator, nodes: list[CandidateNode]) -> list:  # type: ignore[type-arg]
+        """Run ``evaluate_candidates`` for *nodes* via ``anyio.run`` and return its result list.
+
+        ``git_root_dir`` is patched to the parent of this file's directory while the call runs;
+        each node is submitted as ``(node, index, None)`` with MagicMock context and baseline.
+        """
+        repo_root = Path(__file__).resolve().parents[1]
+
+        async def _run() -> list:  # type: ignore[type-arg]
+            with patch("codeflash.code_utils.worktree_pool.git_root_dir", return_value=repo_root):
+                return await evaluator.evaluate_candidates(
+                    candidates=[(node, index, None) for index, node in enumerate(nodes)],
+                    code_context=MagicMock(),
+                    original_code_baseline=MagicMock(),
+                    original_helper_code={},
+                    file_path_to_helper_classes={},
+                )
+
+        return anyio.run(_run)
+
+    def test_code_replacement_failure_returns_eval_failure(self, tmp_path: Path) -> None:
+        """A False return from code replacement surfaces as an EvalFailure with no diffs."""
+        opt = self._make_optimizer_mock(tmp_path)
+        opt.replace_function_and_helpers_with_optimized_code.return_value = False
+        node = self._make_candidate_node()
+        evaluator = ParallelCandidateEvaluator(opt, pool_size=1)
+
+        results = self._evaluate(evaluator, [node])
+
+        assert len(results) == 1
+        _, result = results[0]
+        assert result is not None
+        assert not is_successful(result)
+        failure = result.failure()
+        assert isinstance(failure, EvalFailure)
+        assert "Code replacement failed" in failure.message
+        assert failure.diffs == []
+
+    def test_behavioral_mismatch_carries_diffs(self, tmp_path: Path) -> None:
+        """A Failure from ``_run_in_worktree`` reaches the caller with its diffs intact."""
+        from codeflash.models.models import TestDiff, TestDiffScope
+
+        opt = self._make_optimizer_mock(tmp_path)
+        self._write_source_module(tmp_path)
+        node = self._make_candidate_node()
+        evaluator = ParallelCandidateEvaluator(opt, pool_size=1)
+        mock_diffs = [TestDiff(scope=TestDiffScope.DID_PASS, original_pass=True, candidate_pass=False)]
+        mismatch = Failure(EvalFailure(message="Behavioral mismatch: 1 diffs", diffs=mock_diffs))  # type: ignore[arg-type]
+
+        with patch.object(ParallelCandidateEvaluator, "_run_in_worktree", return_value=mismatch):
+            results = self._evaluate(evaluator, [node])
+
+        assert len(results) == 1
+        _, result = results[0]
+        assert not is_successful(result)
+        failure = result.failure()
+        assert isinstance(failure, EvalFailure)
+        assert len(failure.diffs) == 1
+        assert failure.diffs[0].scope == TestDiffScope.DID_PASS
+
+    def test_successful_candidate_returns_result(self, tmp_path: Path) -> None:
+        """A Success from ``_run_in_worktree`` is returned to the caller with its payload."""
+        opt = self._make_optimizer_mock(tmp_path)
+        self._write_source_module(tmp_path)
+        node = self._make_candidate_node()
+        evaluator = ParallelCandidateEvaluator(opt, pool_size=1)
+        mock_result = MagicMock()
+        mock_result.best_test_runtime = 5000
+
+        with patch.object(ParallelCandidateEvaluator, "_run_in_worktree", return_value=Success(mock_result)):
+            results = self._evaluate(evaluator, [node])
+
+        assert len(results) == 1
+        _, result = results[0]
+        assert is_successful(result)
+        assert result.unwrap().best_test_runtime == 5000
+
+    def test_multiple_candidates_evaluated_concurrently(self, tmp_path: Path) -> None:
+        """Three candidates on a pool of three each reach ``_run_in_worktree`` and succeed.
+
+        NOTE: this asserts the call count and per-candidate success only; it does not
+        measure whether the evaluations actually overlapped in time.
+        """
+        opt = self._make_optimizer_mock(tmp_path)
+        self._write_source_module(tmp_path)
+        nodes = [self._make_candidate_node(f"cand_{i}") for i in range(3)]
+        evaluator = ParallelCandidateEvaluator(opt, pool_size=3)
+        mock_result = MagicMock()
+        mock_result.best_test_runtime = 1000
+        call_count = 0
+
+        async def mock_run_in_worktree(self_eval: object, *args: object, **kwargs: object) -> Success:  # type: ignore[type-arg]
+            nonlocal call_count
+            call_count += 1
+            return Success(mock_result)
+
+        with patch.object(ParallelCandidateEvaluator, "_run_in_worktree", mock_run_in_worktree):
+            results = self._evaluate(evaluator, nodes)
+
+        assert len(results) == 3
+        assert call_count == 3
+        for _, result in results:
+            assert is_successful(result)