@@ -268,19 +268,28 @@ async def _run_in_worktree(
268268 await slot .write_candidate (module_abspath , helper_code )
269269
270270 # Copy instrumented test files into the worktree (they're runtime-generated, not in git)
271- test_files : list [str ] = []
271+ behavior_test_files : list [str ] = []
272+ perf_test_files : list [str ] = []
272273 for file in opt .test_files .test_files :
273274 src = file .instrumented_behavior_file_path
274275 if src .exists ():
275276 await slot .write_candidate (src , src .read_text (encoding = "utf-8" ))
276- mirrored_path = slot .mirror (src )
277- test_files .append (str (mirrored_path ))
277+ behavior_test_files .append (str (slot .mirror (src )))
278+
279+ if file .benchmarking_file_path and file .benchmarking_file_path .exists ():
280+ await slot .write_candidate (
281+ file .benchmarking_file_path , file .benchmarking_file_path .read_text (encoding = "utf-8" )
282+ )
283+ perf_test_files .append (str (slot .mirror (file .benchmarking_file_path )))
278284
279285 # Run behavioral tests in the worktree
280286 worktree_cwd = slot .path
281287 test_env = opt .get_test_env (
282288 codeflash_loop_index = 0 , codeflash_test_iteration = candidate_index , codeflash_tracer_disable = 1
283289 )
290+ # Override PYTHONPATH to point at the worktree so imports resolve to candidate code
291+ worktree_project_root = slot .mirror (Path (opt .args .project_root ))
292+ test_env ["PYTHONPATH" ] = str (worktree_project_root )
284293
285294 from codeflash .code_utils .compat import IS_POSIX , SAFE_SYS_EXECUTABLE
286295
@@ -305,7 +314,7 @@ async def _run_in_worktree(
305314 pytest_test_env = test_env .copy ()
306315 pytest_test_env ["PYTEST_PLUGINS" ] = "codeflash.verification.pytest_plugin"
307316
308- cmd = pytest_cmd_list + common_pytest_args + blocklist_args + result_args + test_files
317+ cmd = pytest_cmd_list + common_pytest_args + blocklist_args + result_args + behavior_test_files
309318
310319 try :
311320 behavior_result = await async_execute_test_subprocess (
@@ -325,8 +334,13 @@ async def _run_in_worktree(
325334 if not behavior_test_results .test_results :
326335 return Failure ("No behavioral test results" )
327336
328- # Compare results
329- match , diffs = opt .compare_candidate_results (original_code_baseline , behavior_test_results , candidate_index )
337+ # Compare pass/fail only — parse_test_xml doesn't capture return values (stored in SQLite),
338+ # so we can only verify that the same tests pass/fail as in the baseline.
339+ from codeflash .verification .equivalence import compare_test_results
340+
341+ match , diffs = compare_test_results (
342+ original_code_baseline .behavior_test_results , behavior_test_results , pass_fail_only = True
343+ )
330344
331345 if not match :
332346 return Failure (f"Behavioral mismatch: { len (diffs )} diffs" )
@@ -345,7 +359,7 @@ async def _run_in_worktree(
345359 f"--timeout={ INDIVIDUAL_TESTCASE_TIMEOUT } " ,
346360 ]
347361
348- perf_cmd = pytest_cmd_list + perf_pytest_args + blocklist_args + perf_result_args + test_files
362+ perf_cmd = pytest_cmd_list + perf_pytest_args + blocklist_args + perf_result_args + perf_test_files
349363
350364 try :
351365 await async_execute_test_subprocess (cmd_list = perf_cmd , cwd = worktree_cwd , env = pytest_test_env , timeout = 600 )
0 commit comments