fix: use perf-instrumented test files for benchmarking, pass_fail_only comparison

KRRT7 · KRRT7 · commit ad64b14fd8c7 · 2026-05-06T16:17:24.000-05:00
Three fixes for the parallel evaluator:
1. Copy both behavioral AND performance instrumented files to worktrees
2. Use benchmarking_file_path for perf tests (captures timing in markers)
3. Use pass_fail_only=True for comparison since parse_test_xml doesn't
   capture return values (they're in SQLite, not XML)
4. Override PYTHONPATH to worktree root for correct module resolution
diff --git a/codeflash/optimization/parallel_evaluator.py b/codeflash/optimization/parallel_evaluator.py
@@ -268,19 +268,28 @@ async def _run_in_worktree(
             await slot.write_candidate(module_abspath, helper_code)
 
         # Copy instrumented test files into the worktree (they're runtime-generated, not in git)
-        test_files: list[str] = []
+        behavior_test_files: list[str] = []
+        perf_test_files: list[str] = []
         for file in opt.test_files.test_files:
             src = file.instrumented_behavior_file_path
             if src.exists():
                 await slot.write_candidate(src, src.read_text(encoding="utf-8"))
-            mirrored_path = slot.mirror(src)
-            test_files.append(str(mirrored_path))
+            behavior_test_files.append(str(slot.mirror(src)))
+
+            if file.benchmarking_file_path and file.benchmarking_file_path.exists():
+                await slot.write_candidate(
+                    file.benchmarking_file_path, file.benchmarking_file_path.read_text(encoding="utf-8")
+                )
+                perf_test_files.append(str(slot.mirror(file.benchmarking_file_path)))
 
         # Run behavioral tests in the worktree
         worktree_cwd = slot.path
         test_env = opt.get_test_env(
             codeflash_loop_index=0, codeflash_test_iteration=candidate_index, codeflash_tracer_disable=1
         )
+        # Override PYTHONPATH to point at the worktree so imports resolve to candidate code
+        worktree_project_root = slot.mirror(Path(opt.args.project_root))
+        test_env["PYTHONPATH"] = str(worktree_project_root)
 
         from codeflash.code_utils.compat import IS_POSIX, SAFE_SYS_EXECUTABLE
 
@@ -305,7 +314,7 @@ async def _run_in_worktree(
         pytest_test_env = test_env.copy()
         pytest_test_env["PYTEST_PLUGINS"] = "codeflash.verification.pytest_plugin"
 
-        cmd = pytest_cmd_list + common_pytest_args + blocklist_args + result_args + test_files
+        cmd = pytest_cmd_list + common_pytest_args + blocklist_args + result_args + behavior_test_files
 
         try:
             behavior_result = await async_execute_test_subprocess(
@@ -325,8 +334,13 @@ async def _run_in_worktree(
         if not behavior_test_results.test_results:
             return Failure("No behavioral test results")
 
-        # Compare results
-        match, diffs = opt.compare_candidate_results(original_code_baseline, behavior_test_results, candidate_index)
+        # Compare pass/fail only — parse_test_xml doesn't capture return values (stored in SQLite),
+        # so we can only verify that the same tests pass/fail as in the baseline.
+        from codeflash.verification.equivalence import compare_test_results
+
+        match, diffs = compare_test_results(
+            original_code_baseline.behavior_test_results, behavior_test_results, pass_fail_only=True
+        )
 
         if not match:
             return Failure(f"Behavioral mismatch: {len(diffs)} diffs")
@@ -345,7 +359,7 @@ async def _run_in_worktree(
             f"--timeout={INDIVIDUAL_TESTCASE_TIMEOUT}",
         ]
 
-        perf_cmd = pytest_cmd_list + perf_pytest_args + blocklist_args + perf_result_args + test_files
+        perf_cmd = pytest_cmd_list + perf_pytest_args + blocklist_args + perf_result_args + perf_test_files
 
         try:
             await async_execute_test_subprocess(cmd_list=perf_cmd, cwd=worktree_cwd, env=pytest_test_env, timeout=600)