Merge branch 'main' of github.com:codeflash-ai/codeflash into optimization-effort

mohammedahmed18 · mohammedahmed18 · commit a040940216f9 · 2026-01-02T01:29:14.000+02:00
diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py
@@ -4,6 +4,7 @@
 import os
 import platform
 import time
+from itertools import count
 from typing import TYPE_CHECKING, Any, cast
 
 import requests
@@ -39,6 +40,11 @@ class AiServiceClient:
     def __init__(self) -> None:
         self.base_url = self.get_aiservice_base_url()
         self.headers = {"Authorization": f"Bearer {get_codeflash_api_key()}", "Connection": "close"}
+        self.llm_call_counter = count(1)
+
+    def get_next_sequence(self) -> int:
+        """Get the next LLM call sequence number."""
+        return next(self.llm_call_counter)
 
     def get_aiservice_base_url(self) -> str:
         if os.environ.get("CODEFLASH_AIS_SERVER", default="prod").lower() == "local":
@@ -105,6 +111,7 @@ def _get_valid_candidates(
                     optimization_id=opt["optimization_id"],
                     source=source,
                     parent_id=opt.get("parent_id", None),
+                    model=opt.get("model"),
                 )
             )
         return candidates
@@ -114,7 +121,6 @@ def optimize_python_code(  # noqa: D417
         source_code: str,
         dependency_code: str,
         trace_id: str,
-        num_candidates: int = 10,
         experiment_metadata: ExperimentMetadata | None = None,
         *,
         is_async: bool = False,
@@ -126,21 +132,22 @@ def optimize_python_code(  # noqa: D417
         - source_code (str): The python code to optimize.
         - dependency_code (str): The dependency code used as read-only context for the optimization
         - trace_id (str): Trace id of optimization run
-        - num_candidates (int): Number of optimization variants to generate. Default is 10.
         - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization
+        - is_async (bool): Whether the function being optimized is async
 
         Returns
         -------
         - List[OptimizationCandidate]: A list of Optimization Candidates.
 
         """
+        logger.info("Generating optimized candidates…")
+        console.rule()
         start_time = time.perf_counter()
         git_repo_owner, git_repo_name = safe_get_repo_owner_and_name()
 
         payload = {
             "source_code": source_code,
             "dependency_code": dependency_code,
-            "n_candidates": num_candidates,
             "trace_id": trace_id,
             "python_version": platform.python_version(),
             "experiment_metadata": experiment_metadata,
@@ -149,22 +156,25 @@ def optimize_python_code(  # noqa: D417
             "repo_owner": git_repo_owner,
             "repo_name": git_repo_name,
             "is_async": is_async,
+            "lsp_mode": is_LSP_enabled(),
+            "call_sequence": self.get_next_sequence(),
         }
+        logger.debug(f"Sending optimize request: trace_id={trace_id}, lsp_mode={payload['lsp_mode']}")
 
-        logger.info("!lsp|Generating optimized candidates…")
-        console.rule()
         try:
             response = self.make_ai_service_request("/optimize", payload=payload, timeout=60)
         except requests.exceptions.RequestException as e:
             logger.exception(f"Error generating optimized candidates: {e}")
             ph("cli-optimize-error-caught", {"error": str(e)})
+            console.rule()
             return []
 
         if response.status_code == 200:
             optimizations_json = response.json()["optimizations"]
-            console.rule()
             end_time = time.perf_counter()
             logger.debug(f"!lsp|Generating possible optimizations took {end_time - start_time:.2f} seconds.")
+            logger.info(f"!lsp|Received {len(optimizations_json)} optimization candidates.")
+            console.rule()
             return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE)
         try:
             error = response.json()["error"]
@@ -184,21 +194,28 @@ def optimize_python_code_line_profiler(  # noqa: D417
         num_candidates: int = 8,
         experiment_metadata: ExperimentMetadata | None = None,
     ) -> list[OptimizedCandidate]:
-        """Optimize the given python code for performance by making a request to the Django endpoint.
+        """Optimize the given python code for performance using line profiler results.
 
         Parameters
         ----------
         - source_code (str): The python code to optimize.
         - dependency_code (str): The dependency code used as read-only context for the optimization
         - trace_id (str): Trace id of optimization run
-        - num_candidates (int): Number of optimization variants to generate. Default is 10.
+        - line_profiler_results (str): Line profiler output to guide optimization
         - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization
 
         Returns
         -------
         - List[OptimizationCandidate]: A list of Optimization Candidates.
 
         """
+        if line_profiler_results == "":
+            logger.info("No LineProfiler results were provided, Skipping optimization.")
+            return []
+
+        logger.info("Generating optimized candidates with line profiler…")
+        console.rule()
+
         payload = {
             "source_code": source_code,
             "dependency_code": dependency_code,
@@ -209,25 +226,20 @@ def optimize_python_code_line_profiler(  # noqa: D417
             "experiment_metadata": experiment_metadata,
             "codeflash_version": codeflash_version,
             "lsp_mode": is_LSP_enabled(),
+            "call_sequence": self.get_next_sequence(),
         }
 
-        console.rule()
-        if line_profiler_results == "":
-            logger.info("No LineProfiler results were provided, Skipping optimization.")
-            console.rule()
-            return []
         try:
             response = self.make_ai_service_request("/optimize-line-profiler", payload=payload, timeout=60)
         except requests.exceptions.RequestException as e:
             logger.exception(f"Error generating optimized candidates: {e}")
             ph("cli-optimize-error-caught", {"error": str(e)})
+            console.rule()
             return []
 
         if response.status_code == 200:
             optimizations_json = response.json()["optimizations"]
-            logger.info(
-                f"!lsp|Generated {len(optimizations_json)} candidate optimizations using line profiler information."
-            )
+            logger.info(f"!lsp|Received {len(optimizations_json)} line profiler optimization candidates.")
             console.rule()
             return self._get_valid_candidates(optimizations_json, OptimizedCandidateSource.OPTIMIZE_LP)
         try:
@@ -265,6 +277,7 @@ def optimize_python_code_refinement(self, request: list[AIServiceRefinerRequest]
                 "trace_id": opt.trace_id,
                 "function_references": opt.function_references,
                 "python_version": platform.python_version(),
+                "call_sequence": self.get_next_sequence(),
             }
             for opt in request
         ]
@@ -399,6 +412,7 @@ def get_new_explanation(  # noqa: D417
             "throughput_improvement": throughput_improvement,
             "function_references": function_references,
             "codeflash_version": codeflash_version,
+            "call_sequence": self.get_next_sequence(),
         }
         logger.info("loading|Generating explanation")
         console.rule()
@@ -561,6 +575,7 @@ def generate_regression_tests(  # noqa: D417
             "python_version": platform.python_version(),
             "codeflash_version": codeflash_version,
             "is_async": function_to_optimize.is_async,
+            "call_sequence": self.get_next_sequence(),
         }
         try:
             response = self.make_ai_service_request("/testgen", payload=payload, timeout=90)
@@ -647,6 +662,7 @@ def get_optimization_review(
             "codeflash_version": codeflash_version,
             "calling_fn_details": calling_fn_details,
             "python_version": platform.python_version(),
+            "call_sequence": self.get_next_sequence(),
         }
         console.rule()
         try:
diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py
@@ -13,6 +13,13 @@
 REPEAT_OPTIMIZATION_PROBABILITY = 0.1
 DEFAULT_IMPORTANCE_THRESHOLD = 0.001
 
+# pytest loop stability
+# For now, we use strict thresholds (large windows and low tolerances), since this is still experimental.
+STABILITY_WINDOW_SIZE = 0.35  # 35% of total window
+STABILITY_CENTER_TOLERANCE = 0.0025  # ±0.25% around median
+STABILITY_SPREAD_TOLERANCE = 0.0025  # 0.25% window spread
+
+# Refinement
 REFINED_CANDIDATE_RANKING_WEIGHTS = (2, 1)  # (runtime, diff), runtime is more important than diff by a factor of 2
 
 # LSP-specific
diff --git a/codeflash/code_utils/env_utils.py b/codeflash/code_utils/env_utils.py
@@ -19,7 +19,6 @@
 def check_formatter_installed(formatter_cmds: list[str], exit_on_failure: bool = True) -> bool:  # noqa
     if not formatter_cmds or formatter_cmds[0] == "disabled":
         return True
-
     first_cmd = formatter_cmds[0]
     cmd_tokens = shlex.split(first_cmd) if isinstance(first_cmd, str) else [first_cmd]
 
diff --git a/codeflash/code_utils/formatter.py b/codeflash/code_utils/formatter.py
@@ -46,18 +46,13 @@ def apply_formatter_cmds(
     print_status: bool,  # noqa
     exit_on_failure: bool = True,  # noqa
 ) -> tuple[Path, str, bool]:
-    should_make_copy = False
-    file_path = path
-
-    if test_dir_str:
-        should_make_copy = True
-        file_path = Path(test_dir_str) / "temp.py"
-
     if not path.exists():
         msg = f"File {path} does not exist. Cannot apply formatter commands."
         raise FileNotFoundError(msg)
 
-    if should_make_copy:
+    file_path = path
+    if test_dir_str:
+        file_path = Path(test_dir_str) / "temp.py"
         shutil.copy2(path, file_path)
 
     file_token = "$file"  # noqa: S105
diff --git a/codeflash/discovery/discover_unit_tests.py b/codeflash/discovery/discover_unit_tests.py
@@ -751,6 +751,7 @@ def process_test_files(
 
     tests_cache = TestsCache(project_root_path)
     logger.info("!lsp|Discovering tests and processing unit tests")
+    console.rule()
     with test_files_progress_bar(total=len(file_to_test_map), description="Processing test files") as (
         progress,
         task_id,
diff --git a/codeflash/models/models.py b/codeflash/models/models.py
@@ -46,6 +46,7 @@ class AIServiceRefinerRequest:
     original_line_profiler_results: str
     optimized_line_profiler_results: str
     function_references: str | None = None
+    call_sequence: int | None = None
 
 
 class TestDiffScope(str, Enum):
@@ -464,6 +465,7 @@ class OptimizedCandidate:
     optimization_id: str
     source: OptimizedCandidateSource
     parent_id: str | None = None
+    model: str | None = None  # Which LLM model generated this candidate
 
 
 @dataclass(frozen=True)
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
@@ -143,6 +143,7 @@ def __init__(
         self.ai_service_client = ai_service_client
         self.executor = executor
         self.effort = effort
+        self.refinement_calls_count = 0
 
         # Initialize queue with initial candidates
         for candidate in initial_candidates:
@@ -152,6 +153,9 @@ def __init__(
         self.all_refinements_data = all_refinements_data
         self.future_all_code_repair = future_all_code_repair
 
+    def get_total_llm_calls(self) -> int:
+        return self.refinement_calls_count
+
     def get_next_candidate(self) -> OptimizedCandidate | None:
         """Get the next candidate from the queue, handling async results as needed."""
         try:
@@ -196,11 +200,13 @@ def _process_refinement_results(self) -> OptimizedCandidate | None:
                 len(self.all_refinements_data),
             )
         )
+        refinement_call_index = 0
 
         if top_n_candidates == len(self.all_refinements_data):
             # if we'll refine all candidates, we can skip the ranking and just refine them all
             for data in self.all_refinements_data:
-                future_refinements.append(self.refine_optimizations([data]))  # noqa: PERF401
+                refinement_call_index += 1
+                future_refinements.append(self.refine_optimizations([data]))
         else:
             diff_lens_list = []
             runtimes_list = []
@@ -218,9 +224,13 @@ def _process_refinement_results(self) -> OptimizedCandidate | None:
             top_indecies = sorted(score_dict, key=score_dict.get)[:top_n_candidates]
 
             for idx in top_indecies:
+                refinement_call_index += 1
                 data = self.all_refinements_data[idx]
                 future_refinements.append(self.refine_optimizations([data]))
 
+        # Record how many refinement requests were submitted in this pass
+        self.refinement_calls_count = refinement_call_index
+
         if future_refinements:
             logger.info("loading|Refining generated code for improved quality and performance...")
 
@@ -240,6 +250,7 @@ def _process_refinement_results(self) -> OptimizedCandidate | None:
             logger.info(
                 f"Added {len(refinement_response)} candidates from refinement, total candidates now: {self.candidate_len}"
             )
+            console.rule()
         self.refinement_done = True
 
         return self.get_next_candidate()
@@ -325,7 +336,7 @@ def __init__(
 
     def can_be_optimized(self) -> Result[tuple[bool, CodeOptimizationContext, dict[Path, str]], str]:
         should_run_experiment = self.experiment_id is not None
-        logger.debug(f"Function Trace ID: {self.function_trace_id}")
+        logger.info(f"Function Trace ID: {self.function_trace_id}")
         ph("cli-optimize-function-start", {"function_trace_id": self.function_trace_id})
         self.cleanup_leftover_test_return_values()
         file_name_from_test_module_name.cache_clear()
@@ -1210,7 +1221,6 @@ def instrument_existing_tests(self, function_to_all_tests: dict[str, set[Functio
         func_qualname = self.function_to_optimize.qualified_name_with_modules_from_root(self.project_root)
         if func_qualname not in function_to_all_tests:
             logger.info(f"Did not find any pre-existing tests for '{func_qualname}', will only use generated tests.")
-            console.rule()
         else:
             test_file_invocation_positions = defaultdict(list)
             for tests_in_file in function_to_all_tests.get(func_qualname):
@@ -1350,7 +1360,8 @@ def generate_tests(
         if concolic_test_str:
             count_tests += 1
 
-        logger.info(f"!lsp|Generated '{count_tests}' tests for '{self.function_to_optimize.function_name}'")
+        logger.info(f"!lsp|Generated {count_tests} tests for '{self.function_to_optimize.function_name}'")
+        console.rule()
 
         generated_tests = GeneratedTestsList(generated_tests=tests)
         return Success((count_tests, generated_tests, function_to_concolic_tests, concolic_test_str))
@@ -1361,15 +1372,13 @@ def generate_optimizations(
         read_only_context_code: str,
         run_experiment: bool = False,  # noqa: FBT001, FBT002
     ) -> Result[tuple[OptimizationSet, str], str]:
-        """Generate optimization candidates for the function."""
-        n_candidates = get_effort_value(EffortKeys.N_OPTIMIZER_CANDIDATES, self.args.effort)
-
+        """Generate optimization candidates for the function. Backend handles multi-model diversity."""
+        # n_candidates = get_effort_value(EffortKeys.N_OPTIMIZER_CANDIDATES, self.args.effort)
         future_optimization_candidates = self.executor.submit(
             self.aiservice_client.optimize_python_code,
             read_writable_code.markdown,
             read_only_context_code,
             self.function_trace_id[:-4] + "EXP0" if run_experiment else self.function_trace_id,
-            n_candidates,
             ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None,
             is_async=self.function_to_optimize.is_async,
         )
@@ -1392,7 +1401,6 @@ def generate_optimizations(
                 read_writable_code.markdown,
                 read_only_context_code,
                 self.function_trace_id[:-4] + "EXP1",
-                n_candidates,
                 ExperimentMetadata(id=self.experiment_id, group="experiment"),
                 is_async=self.function_to_optimize.is_async,
             )
@@ -1401,14 +1409,16 @@ def generate_optimizations(
         # Wait for optimization futures to complete
         concurrent.futures.wait(futures)
 
-        # Retrieve results
-        candidates: list[OptimizedCandidate] = future_optimization_candidates.result()
-        logger.info(f"!lsp|Generated '{len(candidates)}' candidate optimizations.")
+        # Retrieve results; optimize_python_code returns a list of candidates
+        candidates = future_optimization_candidates.result()
 
         if not candidates:
             return Failure(f"/!\\ NO OPTIMIZATIONS GENERATED for {self.function_to_optimize.function_name}")
 
-        candidates_experiment = future_candidates_exp.result() if future_candidates_exp else None
+        # Handle experiment results
+        candidates_experiment = None
+        if future_candidates_exp:
+            candidates_experiment = future_candidates_exp.result()
         function_references = future_references.result()
 
         return Success((OptimizationSet(control=candidates, experiment=candidates_experiment), function_references))
@@ -1895,7 +1905,6 @@ def establish_original_code_baseline(
                 benchmarking_results, self.function_to_optimize.function_name
             )
             logger.debug(f"Original async function throughput: {async_throughput} calls/second")
-            console.rule()
 
         if self.args.benchmark:
             replay_benchmarking_test_results = benchmarking_results.group_by_benchmarks(
@@ -2029,6 +2038,7 @@ def run_optimized_candidate(
                 return self.get_results_not_matched_error()
 
             logger.info(f"loading|Running performance tests for candidate {optimization_candidate_index}...")
+            console.rule()
 
             # For async functions, instrument at definition site for performance benchmarking
             if self.function_to_optimize.is_async:
diff --git a/codeflash/verification/pytest_plugin.py b/codeflash/verification/pytest_plugin.py
diff --git a/codeflash/verification/test_runner.py b/codeflash/verification/test_runner.py