|
42 | 42 | extract_unique_errors, |
43 | 43 | file_name_from_test_module_name, |
44 | 44 | get_run_tmp_file, |
| 45 | + module_name_from_file_path, |
45 | 46 | normalize_by_max, |
46 | 47 | restore_conftest, |
47 | 48 | unified_diff_strings, |
48 | 49 | ) |
49 | 50 | from codeflash.code_utils.config_consts import ( |
50 | 51 | COVERAGE_THRESHOLD, |
51 | 52 | INDIVIDUAL_TESTCASE_TIMEOUT, |
| 53 | + MAX_TEST_REPAIR_CYCLES, |
52 | 54 | MIN_CORRECT_CANDIDATES, |
53 | 55 | OPTIMIZATION_CONTEXT_TOKEN_LIMIT, |
54 | 56 | REFINED_CANDIDATE_RANKING_WEIGHTS, |
@@ -763,6 +765,17 @@ def optimize_function(self) -> Result[BestOptimization, str]: |
763 | 765 |
|
764 | 766 | optimizations_set, function_references = optimization_result.unwrap() |
765 | 767 |
|
| 768 | + review_result = self.review_and_repair_tests( |
| 769 | + generated_tests=generated_tests, |
| 770 | + code_context=code_context, |
| 771 | + original_helper_code=original_helper_code, |
| 772 | + ) |
| 773 | + if not is_successful(review_result): |
| 774 | + return Failure(review_result.failure()) |
| 775 | + |
| 776 | + generated_tests = review_result.unwrap() |
| 777 | + |
| 778 | + # Full baseline (behavioral + benchmarking) runs once on the final approved tests |
766 | 779 | baseline_setup_result = self.setup_and_establish_baseline( |
767 | 780 | code_context=code_context, |
768 | 781 | original_helper_code=original_helper_code, |
@@ -1885,6 +1898,135 @@ def setup_and_establish_baseline( |
1885 | 1898 | ) |
1886 | 1899 | ) |
1887 | 1900 |
|
def run_behavioral_validation(
    self,
    code_context: CodeOptimizationContext,
    original_helper_code: dict[Path, str],
) -> TestResults | None:
    """Run the test files in behavior mode only and hand back the parsed results.

    Args:
        code_context: Optimization context; its ``helper_functions`` are scanned
            to find which classes (per file) own helper methods.
        original_helper_code: Original helper sources keyed by file path, written
            back to disk once the run finishes (successfully or not).

    Returns:
        The behavioral test results, or ``None`` when the results object is falsy.
    """
    # Map each helper file to the class names owning helper methods. A dotted
    # qualified name is taken to mean "Class.method"; the function being
    # optimized itself is excluded.
    helper_classes_by_file: dict[Path, set[str]] = defaultdict(set)
    for helper in code_context.helper_functions:
        if helper.qualified_name == self.function_to_optimize.qualified_name:
            continue
        if "." not in helper.qualified_name:
            continue
        owner_class = helper.qualified_name.split(".")[0]
        helper_classes_by_file[helper.file_path].add(owner_class)

    env = self.get_test_env(codeflash_loop_index=0, codeflash_test_iteration=0, codeflash_tracer_disable=1)

    if self.function_to_optimize.is_async:
        self.instrument_async_for_mode(TestingMode.BEHAVIOR)

    try:
        self.instrument_capture(helper_classes_by_file)
        results, _ = self.run_and_parse_tests(
            testing_type=TestingMode.BEHAVIOR,
            test_env=env,
            test_files=self.test_files,
            optimization_iteration=0,
            testing_time=TOTAL_LOOPING_TIME_EFFECTIVE,
            enable_coverage=False,
            code_context=code_context,
        )
    finally:
        # Always put the original function and helper sources back on disk,
        # even when instrumentation or the test run raised.
        self.write_code_and_helpers(
            self.function_to_optimize_source_code, original_helper_code, self.function_to_optimize.file_path
        )

    if not results:
        return None
    return results
| 1936 | + |
def review_and_repair_tests(
    self,
    generated_tests: GeneratedTestsList,
    code_context: CodeOptimizationContext,
    original_helper_code: dict[Path, str],
) -> Result[GeneratedTestsList, str]:
    """Run behavioral tests, review quality per-function, repair flagged functions.

    Flow (up to MAX_TEST_REPAIR_CYCLES):
      behavioral → collect failures → AI review → repair flagged → loop
    No benchmarking runs here — only behavioral validation.

    Args:
        generated_tests: Generated tests to validate; repaired entries are
            updated in place and their behavior/perf files rewritten on disk.
        code_context: Optimization context forwarded to the behavioral run.
        original_helper_code: Original helper sources keyed by file path,
            forwarded to the behavioral run.

    Returns:
        ``Success(generated_tests)`` once a cycle needs no repair or the cycle
        budget is exhausted; ``Failure`` with a message when a behavioral run
        yields no results.

    Note:
        Tests repaired during the final cycle are not re-run by this method.
    """
    for cycle in range(MAX_TEST_REPAIR_CYCLES):
        # 1. Run behavioral tests
        behavioral_results = self.run_behavioral_validation(code_context, original_helper_code)
        if behavioral_results is None:
            return Failure("Generated tests failed behavioral validation.")

        # 2. Collect per-function failures grouped by behavior file path.
        # A dict is used as an ordered set: one test function can fail in
        # several results, but it should be reported to the reviewer once.
        failed_by_file: dict[Path, dict[str, None]] = defaultdict(dict)
        for result in behavioral_results.test_results:
            if result.test_type == TestType.GENERATED_REGRESSION and not result.did_pass:
                failed_by_file[result.file_name][result.id.test_function_name] = None

        # 3. Build review request with failed functions pre-flagged
        tests_for_review = [
            {
                "test_source": gt.generated_original_test_source,
                "test_index": i,
                "failed_test_functions": list(failed_by_file.get(gt.behavior_file_path, ())),
            }
            for i, gt in enumerate(generated_tests.generated_tests)
        ]

        review_results = self.aiservice_client.review_generated_tests(
            tests=tests_for_review,
            function_source_code=self.function_to_optimize_source_code,
            function_name=self.function_to_optimize.function_name,
            trace_id=self.function_trace_id,
            language=self.function_to_optimize.language,
        )

        # 4. Repair test files that have flagged functions
        any_repaired = False
        test_count = len(generated_tests.generated_tests)
        for review in review_results:
            if not review.functions_to_repair:
                continue

            # The index comes back from the review service: reject anything
            # that is not a valid position instead of crashing (out of range)
            # or silently repairing the wrong test (negative index).
            test_index = review.test_index
            if not isinstance(test_index, int) or not 0 <= test_index < test_count:
                logger.warning(f"Review returned invalid test index {test_index!r}, skipping repair")
                continue

            gt = generated_tests.generated_tests[test_index]
            function_names = [f.function_name for f in review.functions_to_repair]
            fn_names = ", ".join(function_names)
            logger.info(f"Repairing test functions in test {test_index} (cycle {cycle + 1}): {fn_names}")
            ph("cli-testgen-repair", {
                "test_index": test_index,
                "cycle": cycle + 1,
                "functions": function_names,
            })

            test_module_path = Path(
                module_name_from_file_path(gt.behavior_file_path, self.test_cfg.tests_project_rootdir)
            )
            repair_result = self.aiservice_client.repair_generated_tests(
                test_source=gt.generated_original_test_source,
                functions_to_repair=review.functions_to_repair,
                function_source_code=self.function_to_optimize_source_code,
                function_to_optimize=self.function_to_optimize,
                helper_function_names=[],
                module_path=Path(self.original_module_path),
                test_module_path=test_module_path,
                test_framework=self.test_cfg.test_framework,
                test_timeout=INDIVIDUAL_TESTCASE_TIMEOUT,
                trace_id=self.function_trace_id,
                language=self.function_to_optimize.language,
            )

            if repair_result is None:
                logger.warning(f"Repair failed for test {test_index}, keeping original")
                continue

            repaired_source, behavior_source, perf_source = repair_result
            gt.generated_original_test_source = repaired_source
            gt.instrumented_behavior_test_source = behavior_source
            gt.instrumented_perf_test_source = perf_source

            # Persist the repaired instrumented sources so the next
            # behavioral run picks them up from disk.
            gt.behavior_file_path.write_text(behavior_source, encoding="utf8")
            gt.perf_file_path.write_text(perf_source, encoding="utf8")
            any_repaired = True

        # Nothing was repaired this cycle — another pass would change nothing
        if not any_repaired:
            break

    return Success(generated_tests)
| 2029 | + |
1888 | 2030 | def find_and_process_best_optimization( |
1889 | 2031 | self, |
1890 | 2032 | optimizations_set: OptimizationSet, |
|
0 commit comments