feat: per-function test quality review and repair

KRRT7 · KRRT7 · commit 7613d768b2c1 · 2026-03-02T09:12:33.000-05:00
Add review + repair step between test generation and baseline:
- Run behavioral tests to identify failing test functions
- Send the tests to /ai/testgen_review with failures pre-flagged; the AI
  reviews passing functions for unrealistic patterns (cache warm-up,
  internal state manipulation, identical inputs)
- Repair flagged functions via /ai/testgen_repair
- Loop up to MAX_TEST_REPAIR_CYCLES (default 1)
- Full baseline (behavioral + benchmarking) runs once on final tests
diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py
@@ -20,9 +20,11 @@
 from codeflash.models.models import (
     AIServiceRefinerRequest,
     CodeStringsMarkdown,
+    FunctionRepairInfo,
     OptimizationReviewResult,
     OptimizedCandidate,
     OptimizedCandidateSource,
+    TestFileReview,
 )
 from codeflash.telemetry.posthog_cf import ph
 from codeflash.version import __version__ as codeflash_version
@@ -803,6 +805,142 @@ def generate_regression_tests(
             ph("cli-testgen-error-response", {"response_status_code": response.status_code, "error": response.text})
             return None
 
+    def review_generated_tests(
+        self,
+        tests: list[dict],
+        function_source_code: str,
+        function_name: str,
+        trace_id: str,
+        language: str = "python",
+    ) -> list[TestFileReview]:
+        """Ask the AI service which generated test functions need repair.
+
+        Posts the review entries in ``tests`` together with the source and name of the
+        function under test to ``/testgen_review``.
+
+        Returns:
+            One ``TestFileReview`` per item of the response's ``"reviews"`` list. An empty
+            list is returned when the request raises, the service answers with a non-200
+            status, or the 200 body cannot be parsed.
+        """
+        payload = {
+            "tests": tests,
+            "function_source_code": function_source_code,
+            "function_name": function_name,
+            "trace_id": trace_id,
+            "language": language,
+            "codeflash_version": codeflash_version,
+            "call_sequence": self.get_next_sequence(),
+        }
+        try:
+            response = self.make_ai_service_request("/testgen_review", payload=payload, timeout=self.timeout)
+        except requests.exceptions.RequestException as e:
+            logger.exception(f"Error reviewing generated tests: {e}")
+            ph("cli-testgen-review-error-caught", {"error": str(e)})
+            return []
+
+        if response.status_code == 200:
+            try:
+                data = response.json()
+                return [
+                    TestFileReview(
+                        test_index=r["test_index"],
+                        functions_to_repair=[
+                            FunctionRepairInfo(function_name=f["function_name"], reason=f.get("reason", ""))
+                            for f in r.get("functions", [])
+                        ],
+                    )
+                    for r in data.get("reviews", [])
+                ]
+            except (ValueError, KeyError, TypeError, AttributeError) as e:
+                # Review is best-effort: a malformed success body is reported like any other
+                # service error instead of being raised to the caller.
+                logger.exception(f"Error reviewing generated tests: {e}")
+                ph(
+                    "cli-testgen-review-error-response",
+                    {"response_status_code": response.status_code, "error": str(e)},
+                )
+                return []
+        try:
+            error = response.json()["error"]
+        except Exception:
+            error = response.text
+        logger.error(f"Error reviewing generated tests: {response.status_code} - {error}")
+        ph("cli-testgen-review-error-response", {"response_status_code": response.status_code, "error": error})
+        return []
+
+    def repair_generated_tests(
+        self,
+        test_source: str,
+        functions_to_repair: list[FunctionRepairInfo],
+        function_source_code: str,
+        function_to_optimize: FunctionToOptimize,
+        helper_function_names: list[str],
+        module_path: Path,
+        test_module_path: Path,
+        test_framework: str,
+        test_timeout: int,
+        trace_id: str,
+        language: str = "python",
+    ) -> tuple[str, str, str] | None:
+        """Ask the AI service to repair the flagged functions of one generated test source.
+
+        Posts ``test_source`` and each flagged function's name and reason to
+        ``/testgen_repair``, along with the function under test and the test-run settings.
+
+        Returns:
+            ``(generated_tests, instrumented_behavior_tests, instrumented_perf_tests)`` from
+            the 200 response, or ``None`` when the request raises, the service answers with
+            a non-200 status, or the 200 body is malformed.
+        """
+        payload: dict[str, Any] = {
+            "test_source": test_source,
+            "functions_to_repair": [{"function_name": f.function_name, "reason": f.reason} for f in functions_to_repair],
+            "function_source_code": function_source_code,
+            "function_to_optimize": function_to_optimize,
+            "helper_function_names": helper_function_names,
+            "module_path": module_path,
+            "test_module_path": test_module_path,
+            "test_framework": test_framework,
+            "test_timeout": test_timeout,
+            "trace_id": trace_id,
+            "language": language,
+            "python_version": platform.python_version(),
+            "codeflash_version": codeflash_version,
+            "call_sequence": self.get_next_sequence(),
+        }
+        try:
+            response = self.make_ai_service_request("/testgen_repair", payload=payload, timeout=self.timeout)
+        except requests.exceptions.RequestException as e:
+            logger.exception(f"Error repairing generated tests: {e}")
+            ph("cli-testgen-repair-error-caught", {"error": str(e)})
+            return None
+
+        if response.status_code == 200:
+            try:
+                data = response.json()
+                return (
+                    data["generated_tests"],
+                    data["instrumented_behavior_tests"],
+                    data["instrumented_perf_tests"],
+                )
+            except (ValueError, KeyError, TypeError) as e:
+                # Returning None tells the caller the repair failed; a malformed success
+                # body is reported and handled that way instead of being raised.
+                logger.exception(f"Error repairing generated tests: {e}")
+                ph(
+                    "cli-testgen-repair-error-response",
+                    {"response_status_code": response.status_code, "error": str(e)},
+                )
+                return None
+        try:
+            error = response.json()["error"]
+        except Exception:
+            error = response.text
+        logger.error(f"Error repairing generated tests: {response.status_code} - {error}")
+        ph("cli-testgen-repair-error-response", {"response_status_code": response.status_code, "error": error})
+        return None
+
     def get_optimization_review(
         self,
         original_code: dict[Path, str],
diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py
@@ -21,6 +21,7 @@
 COVERAGE_THRESHOLD = 60.0
 MIN_TESTCASE_PASSED_THRESHOLD = 6
 REPEAT_OPTIMIZATION_PROBABILITY = 0.1
+MAX_TEST_REPAIR_CYCLES = 1  # upper bound on review/repair passes over the generated tests
 DEFAULT_IMPORTANCE_THRESHOLD = 0.001
 
 # pytest loop stability
diff --git a/codeflash/models/models.py b/codeflash/models/models.py
@@ -115,6 +115,16 @@ class OptimizationReviewResult(NamedTuple):
     explanation: str
 
 
+class FunctionRepairInfo(NamedTuple):  # one generated test function flagged for repair
+    function_name: str  # name of the flagged test function
+    reason: str  # why it was flagged; the AI-service client stores "" when the response omits it
+
+
+class TestFileReview(NamedTuple):  # review outcome for one generated test
+    test_index: int  # index of the test in the list that was sent for review
+    functions_to_repair: list[FunctionRepairInfo]  # flagged functions; empty means nothing to repair
+
+
 # If the method spam is in the class Ham, which is at the top level of the module eggs in the package foo, the fully
 # qualified name of the method is foo.eggs.Ham.spam, its qualified name is Ham.spam, and its name is spam. The full name
 # of the module is foo.eggs.
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
@@ -42,13 +42,15 @@
     extract_unique_errors,
     file_name_from_test_module_name,
     get_run_tmp_file,
+    module_name_from_file_path,
     normalize_by_max,
     restore_conftest,
     unified_diff_strings,
 )
 from codeflash.code_utils.config_consts import (
     COVERAGE_THRESHOLD,
     INDIVIDUAL_TESTCASE_TIMEOUT,
+    MAX_TEST_REPAIR_CYCLES,
     MIN_CORRECT_CANDIDATES,
     OPTIMIZATION_CONTEXT_TOKEN_LIMIT,
     REFINED_CANDIDATE_RANKING_WEIGHTS,
@@ -763,6 +765,17 @@ def optimize_function(self) -> Result[BestOptimization, str]:
 
         optimizations_set, function_references = optimization_result.unwrap()
 
+        review_result = self.review_and_repair_tests(
+            generated_tests=generated_tests,
+            code_context=code_context,
+            original_helper_code=original_helper_code,
+        )
+        if not is_successful(review_result):
+            return Failure(review_result.failure())
+
+        generated_tests = review_result.unwrap()
+
+        # Full baseline (behavioral + benchmarking) runs once on the final approved tests
         baseline_setup_result = self.setup_and_establish_baseline(
             code_context=code_context,
             original_helper_code=original_helper_code,
@@ -1885,6 +1898,162 @@ def setup_and_establish_baseline(
             )
         )
 
+    def run_behavioral_validation(
+        self,
+        code_context: CodeOptimizationContext,
+        original_helper_code: dict[Path, str],
+    ) -> TestResults | None:
+        """Run behavioral tests only. Returns the parsed results, or None when they are falsy.
+
+        Runs a single behavior-mode pass (loop index 0, coverage disabled). Whether the run
+        succeeds or raises, ``write_code_and_helpers`` is then called with the saved original
+        function source and ``original_helper_code``.
+        """
+        file_path_to_helper_classes: dict[Path, set[str]] = defaultdict(set)
+        for function_source in code_context.helper_functions:
+            if (
+                function_source.qualified_name != self.function_to_optimize.qualified_name
+                and "." in function_source.qualified_name
+            ):
+                # Dotted helper names are grouped per file by their first component.
+                file_path_to_helper_classes[function_source.file_path].add(
+                    function_source.qualified_name.split(".")[0]
+                )
+
+        test_env = self.get_test_env(codeflash_loop_index=0, codeflash_test_iteration=0, codeflash_tracer_disable=1)
+        try:
+            # Async instrumentation happens inside the try so that the restore in ``finally``
+            # also runs when instrumentation itself raises.
+            if self.function_to_optimize.is_async:
+                self.instrument_async_for_mode(TestingMode.BEHAVIOR)
+            self.instrument_capture(file_path_to_helper_classes)
+            behavioral_results, _ = self.run_and_parse_tests(
+                testing_type=TestingMode.BEHAVIOR,
+                test_env=test_env,
+                test_files=self.test_files,
+                optimization_iteration=0,
+                testing_time=TOTAL_LOOPING_TIME_EFFECTIVE,
+                enable_coverage=False,
+                code_context=code_context,
+            )
+        finally:
+            self.write_code_and_helpers(
+                self.function_to_optimize_source_code, original_helper_code, self.function_to_optimize.file_path
+            )
+        return behavioral_results if behavioral_results else None
+
+    def review_and_repair_tests(
+        self,
+        generated_tests: GeneratedTestsList,
+        code_context: CodeOptimizationContext,
+        original_helper_code: dict[Path, str],
+    ) -> Result[GeneratedTestsList, str]:
+        """Run behavioral tests, review quality per-function, repair flagged functions.
+
+        Flow (up to MAX_TEST_REPAIR_CYCLES):
+          behavioral → collect failures → AI review passing functions → repair flagged → loop
+        No benchmarking runs here — only behavioral validation.
+
+        A successful repair replaces the three sources on the matching ``generated_tests``
+        entry and rewrites its behavior and perf test files; a failed repair keeps the
+        original test. Reviews whose ``test_index`` does not match an entry are ignored.
+
+        Returns:
+            ``Success(generated_tests)`` with the same, possibly updated, object, or
+            ``Failure`` when a behavioral run returns no results.
+        """
+        num_tests = len(generated_tests.generated_tests)
+        for cycle in range(MAX_TEST_REPAIR_CYCLES):
+            # 1. Run behavioral tests
+            behavioral_results = self.run_behavioral_validation(code_context, original_helper_code)
+            if behavioral_results is None:
+                return Failure("Generated tests failed behavioral validation.")
+
+            # 2. Collect per-function failures grouped by behavior file path
+            failed_by_file: dict[Path, list[str]] = defaultdict(list)
+            for result in behavioral_results.test_results:
+                if result.test_type == TestType.GENERATED_REGRESSION and not result.did_pass:
+                    failed_fns = failed_by_file[result.file_name]
+                    # List each function once even when it has several failing results.
+                    if result.id.test_function_name not in failed_fns:
+                        failed_fns.append(result.id.test_function_name)
+
+            # 3. Build review request with failed functions pre-flagged
+            tests_for_review = [
+                {
+                    "test_source": gt.generated_original_test_source,
+                    "test_index": i,
+                    "failed_test_functions": failed_by_file.get(gt.behavior_file_path, []),
+                }
+                for i, gt in enumerate(generated_tests.generated_tests)
+            ]
+
+            review_results = self.aiservice_client.review_generated_tests(
+                tests=tests_for_review,
+                function_source_code=self.function_to_optimize_source_code,
+                function_name=self.function_to_optimize.function_name,
+                trace_id=self.function_trace_id,
+                language=self.function_to_optimize.language,
+            )
+
+            # 4. Repair test files that have flagged functions
+            any_repaired = False
+            for review in review_results:
+                if not review.functions_to_repair:
+                    continue
+
+                # test_index comes from the service response: out of range it would raise
+                # IndexError, and a negative value would silently pick the wrong test.
+                if not isinstance(review.test_index, int) or not 0 <= review.test_index < num_tests:
+                    logger.warning(f"Ignoring review with invalid test index {review.test_index!r}")
+                    continue
+
+                gt = generated_tests.generated_tests[review.test_index]
+                flagged_names = [f.function_name for f in review.functions_to_repair]
+                fn_names = ", ".join(flagged_names)
+                logger.info(f"Repairing test functions in test {review.test_index} (cycle {cycle + 1}): {fn_names}")
+                ph("cli-testgen-repair", {
+                    "test_index": review.test_index,
+                    "cycle": cycle + 1,
+                    "functions": flagged_names,
+                })
+
+                test_module_path = Path(
+                    module_name_from_file_path(gt.behavior_file_path, self.test_cfg.tests_project_rootdir)
+                )
+                repair_result = self.aiservice_client.repair_generated_tests(
+                    test_source=gt.generated_original_test_source,
+                    functions_to_repair=review.functions_to_repair,
+                    function_source_code=self.function_to_optimize_source_code,
+                    function_to_optimize=self.function_to_optimize,
+                    helper_function_names=[],
+                    module_path=Path(self.original_module_path),
+                    test_module_path=test_module_path,
+                    test_framework=self.test_cfg.test_framework,
+                    test_timeout=INDIVIDUAL_TESTCASE_TIMEOUT,
+                    trace_id=self.function_trace_id,
+                    language=self.function_to_optimize.language,
+                )
+
+                if repair_result is None:
+                    logger.warning(f"Repair failed for test {review.test_index}, keeping original")
+                    continue
+
+                repaired_source, behavior_source, perf_source = repair_result
+                gt.generated_original_test_source = repaired_source
+                gt.instrumented_behavior_test_source = behavior_source
+                gt.instrumented_perf_test_source = perf_source
+
+                gt.behavior_file_path.write_text(behavior_source, encoding="utf8")
+                gt.perf_file_path.write_text(perf_source, encoding="utf8")
+                any_repaired = True
+
+            # Nothing was repaired this cycle (no flags, or every repair failed), so stop looping
+            if not any_repaired:
+                break
+
+        return Success(generated_tests)
+
     def find_and_process_best_optimization(
         self,
         optimizations_set: OptimizationSet,