codeflash-ai
diff --git a/codeflash/benchmarking/compare.py b/codeflash/benchmarking/compare.py
Lines changed: 789 additions & 216 deletions
diff --git a/codeflash/benchmarking/plugin/plugin.py b/codeflash/benchmarking/plugin/plugin.py
Lines changed: 232 additions & 113 deletions
diff --git a/codeflash/benchmarking/pytest_new_process_memory_benchmarks.py b/codeflash/benchmarking/pytest_new_process_memory_benchmarks.py
Lines changed: 42 additions & 0 deletions
diff --git a/codeflash/benchmarking/trace_benchmarks.py b/codeflash/benchmarking/trace_benchmarks.py
Lines changed: 36 additions & 0 deletions
diff --git a/codeflash/benchmarking/utils.py b/codeflash/benchmarking/utils.py
Lines changed: 13 additions & 8 deletions
diff --git a/codeflash/cli_cmds/cli.py b/codeflash/cli_cmds/cli.py
Lines changed: 14 additions & 1 deletion
diff --git a/codeflash/cli_cmds/cmd_compare.py b/codeflash/cli_cmds/cmd_compare.py
Lines changed: 126 additions & 21 deletions
diff --git a/codeflash/optimization/optimizer.py b/codeflash/optimization/optimizer.py
Lines changed: 2 additions & 1 deletion
diff --git a/pyproject.toml b/pyproject.toml
Lines changed: 4 additions & 2 deletions
@@ -0,0 +1,42 @@
+"""Subprocess entry point for memory profiling benchmarks via pytest-memray.
+
+Usage: python pytest_new_process_memory_benchmarks.py BENCHMARKS_ROOT MEMRAY_BIN_DIR MEMRAY_BIN_PREFIX
+
+Runs pytest with --memray --native to profile peak memory per test function.
+The codeflash-benchmark plugin is left active (without --codeflash-trace) so it
+provides a no-op ``benchmark`` fixture for tests that depend on it.
+"""
+
+import sys
+from pathlib import Path
+
+if __name__ == "__main__":
+    import pytest
+
+    # Read argv only when run as a script so that merely importing this module
+    # cannot fail with IndexError.
+    if len(sys.argv) < 4:
+        sys.exit(f"usage: {sys.argv[0]} BENCHMARKS_ROOT MEMRAY_BIN_DIR MEMRAY_BIN_PREFIX")
+    benchmarks_root, memray_bin_dir, memray_bin_prefix = sys.argv[1:4]
+
+    # The directory handed to --memray-bin-path must exist before pytest starts.
+    Path(memray_bin_dir).mkdir(parents=True, exist_ok=True)
+
+    # Expands to "-p", "no:benchmark", "-p", "no:codspeed", ... in this order.
+    disabled = [flag for name in ("benchmark", "codspeed", "cov", "profiling") for flag in ("-p", f"no:{name}")]
+
+    exitcode = pytest.main(
+        [
+            benchmarks_root,
+            "--memray",
+            "--native",
+            f"--memray-bin-path={memray_bin_dir}",
+            f"--memray-bin-prefix={memray_bin_prefix}",
+            "--hide-memray-summary",
+            *disabled,
+            "-s",  # do not capture test output
+            "-o",
+            "addopts=",  # override any addopts from the project's pytest configuration
+        ]
+    )
+    sys.exit(exitcode)
@@ -46,3 +46,39 @@ def trace_benchmarks_pytest(
             error_section = combined_output
         logger.warning(f"Error collecting benchmarks - Pytest Exit code: {result.returncode}, {error_section}")
         logger.debug(f"Full pytest output:\n{combined_output}")
+
+
+def memory_benchmarks_pytest(
+    benchmarks_root: Path, project_root: Path, memray_bin_dir: Path, memray_bin_prefix: str, timeout: int = 300
+) -> None:
+    """Run the benchmarks under pytest-memray in a separate Python process.
+
+    Executes the sibling script ``pytest_new_process_memory_benchmarks.py`` via ``subprocess.run``.
+    On a non-zero exit code the relevant part of pytest's output is logged as a warning.
+    """
+    env = make_env_with_project_root(project_root)
+    subprocess_kwargs = get_cross_platform_subprocess_run_args(
+        cwd=project_root, env=env, timeout=timeout, check=False, text=True, capture_output=True
+    )
+    script = Path(__file__).parent / "pytest_new_process_memory_benchmarks.py"
+    command = [SAFE_SYS_EXECUTABLE, script, benchmarks_root, memray_bin_dir, memray_bin_prefix]
+    result = subprocess.run(command, **subprocess_kwargs)  # noqa: PLW1510
+    if result.returncode == 0:
+        return
+    combined_output = result.stdout
+    if result.stderr:
+        combined_output = combined_output + "\n" + result.stderr if combined_output else result.stderr
+    # Prefer the ERRORS/FAILURES section of the pytest report; otherwise log everything.
+    section_patterns = (
+        ("ERROR collecting", r"={3,}\s*ERRORS\s*={3,}\n([\s\S]*?)(?:={3,}|$)"),
+        ("FAILURES", r"={3,}\s*FAILURES\s*={3,}\n([\s\S]*?)(?:={3,}|$)"),
+    )
+    error_section = combined_output
+    for marker, pattern in section_patterns:
+        if marker in combined_output:
+            match = re.search(pattern, combined_output)
+            if match:
+                error_section = match.group(1)
+            break
+    logger.warning(f"Error collecting memory benchmarks - Pytest Exit code: {result.returncode}, {error_section}")
+    logger.debug(f"Full pytest output:\n{combined_output}")
@@ -1,6 +1,8 @@
 from __future__ import annotations
 
+import logging
 import shutil
+from operator import itemgetter
 from typing import TYPE_CHECKING, Optional
 
 from rich.console import Console
@@ -16,27 +18,30 @@
 
 
 def validate_and_format_benchmark_table(
-    function_benchmark_timings: dict[str, dict[BenchmarkKey, int]], total_benchmark_timings: dict[BenchmarkKey, int]
+    function_benchmark_timings: dict[str, dict[BenchmarkKey, float]], total_benchmark_timings: dict[BenchmarkKey, float]
 ) -> dict[str, list[tuple[BenchmarkKey, float, float, float]]]:
     function_to_result = {}
-    # Process each function's benchmark data
+    scale = 1_000_000.0  # nanoseconds per millisecond
     for func_path, test_times in function_benchmark_timings.items():
         # Sort by percentage (highest first)
         sorted_tests = []
         for benchmark_key, func_time in test_times.items():
             total_time = total_benchmark_timings.get(benchmark_key, 0)
             if func_time > total_time:
-                logger.debug(f"Skipping test {benchmark_key} due to func_time {func_time} > total_time {total_time}")
                 # If the function time is greater than total time, likely to have multithreading / multiprocessing issues.
                 # Do not try to project the optimization impact for this function.
+                if logger.isEnabledFor(logging.DEBUG):  # skip building the message when DEBUG is off
+                    logger.debug(
+                        f"Skipping test {benchmark_key} due to func_time {func_time} > total_time {total_time}"
+                    )
                 sorted_tests.append((benchmark_key, 0.0, 0.0, 0.0))
             elif total_time > 0:
                 percentage = (func_time / total_time) * 100
                 # Convert nanoseconds to milliseconds
-                func_time_ms = func_time / 1_000_000
-                total_time_ms = total_time / 1_000_000
+                func_time_ms = func_time / scale
+                total_time_ms = total_time / scale
                 sorted_tests.append((benchmark_key, total_time_ms, func_time_ms, percentage))
-        sorted_tests.sort(key=lambda x: x[3], reverse=True)
+        sorted_tests.sort(key=itemgetter(3), reverse=True)
         function_to_result[func_path] = sorted_tests
     return function_to_result
 
@@ -77,8 +82,8 @@ def print_benchmark_table(function_to_results: dict[str, list[tuple[BenchmarkKey
 
 def process_benchmark_data(
     replay_performance_gain: dict[BenchmarkKey, float],
-    fto_benchmark_timings: dict[BenchmarkKey, int],
-    total_benchmark_timings: dict[BenchmarkKey, int],
+    fto_benchmark_timings: dict[BenchmarkKey, float],
+    total_benchmark_timings: dict[BenchmarkKey, float],
 ) -> Optional[ProcessedBenchmarkInfo]:
     """Process benchmark data and generate detailed benchmark information.
 
 
@@ -383,13 +383,26 @@ def _build_parser() -> ArgumentParser:
     auth_subparsers.add_parser("status", help="Check authentication status")
 
     compare_parser = subparsers.add_parser("compare", help="Compare benchmark performance between two git refs.")
-    compare_parser.add_argument("base_ref", help="Base git ref (branch, tag, or commit)")
+    compare_parser.add_argument(
+        "base_ref", nargs="?", default=None, help="Base git ref (default: auto-detect from PR or default branch)"
+    )
     compare_parser.add_argument("head_ref", nargs="?", default=None, help="Head git ref (default: current branch)")
     compare_parser.add_argument("--pr", type=int, help="Resolve head ref from a PR number (requires gh CLI)")
     compare_parser.add_argument(
         "--functions", type=str, help="Explicit functions to instrument: 'file.py::func1,func2;other.py::func3'"
     )
     compare_parser.add_argument("--timeout", type=int, default=600, help="Benchmark timeout in seconds (default: 600)")
+    compare_parser.add_argument("--output", "-o", type=str, help="Write markdown report to file")
+    compare_parser.add_argument(
+        "--memory", action="store_true", help="Profile peak memory usage per benchmark (requires memray, Linux/macOS)"
+    )
+    compare_parser.add_argument("--script", type=str, help="Shell command to run as benchmark in each worktree")
+    compare_parser.add_argument(
+        "--script-output",
+        type=str,
+        dest="script_output",
+        help="Relative path to JSON results file produced by --script (required with --script)",
+    )
     compare_parser.add_argument("--config-file", type=str, dest="config_file", help="Path to pyproject.toml")
 
     trace_optimize.add_argument(
 
@@ -13,15 +13,73 @@
     from codeflash.models.function_types import FunctionToOptimize
 
 from codeflash.cli_cmds.console import logger
-from codeflash.code_utils.config_parser import parse_config_file
 
 
 def run_compare(args: Namespace) -> None:
     """Entry point for the compare subcommand."""
-    # Load project config
-    pyproject_config, pyproject_file_path = parse_config_file(args.config_file)
+    # Resolve head_ref: explicit arg > --pr > current branch
+    head_ref = args.head_ref
+    if args.pr:
+        head_ref = resolve_pr_branch(args.pr)
+    if not head_ref:
+        head_ref = get_current_branch()
+        if not head_ref:
+            logger.error("Must provide head_ref, --pr, or be on a branch")
+            sys.exit(1)
+        logger.info(f"Auto-detected head ref: {head_ref}")
+
+    # Resolve base_ref: explicit arg > PR base branch > repo default branch
+    base_ref = args.base_ref
+    if not base_ref:
+        base_ref = detect_base_ref(head_ref)
+        if not base_ref:
+            logger.error("Could not auto-detect base ref. Provide it explicitly or ensure gh CLI is available.")
+            sys.exit(1)
+        logger.info(f"Auto-detected base ref: {base_ref}")
+
+    # Script mode: run an arbitrary benchmark command on each worktree (no codeflash config needed)
+    script_cmd = getattr(args, "script", None)
+    if script_cmd:
+        script_output = getattr(args, "script_output", None)
+        if not script_output:
+            logger.error("--script-output is required when using --script")
+            sys.exit(1)
+
+        import git
+
+        project_root = Path(git.Repo(Path.cwd(), search_parent_directories=True).working_dir)
+
+        from codeflash.benchmarking.compare import compare_with_script
+
+        result = compare_with_script(
+            base_ref=base_ref,
+            head_ref=head_ref,
+            project_root=project_root,
+            script_cmd=script_cmd,
+            script_output=script_output,
+            timeout=args.timeout,
+            memory=getattr(args, "memory", False),
+        )
+
+        if not result.base_results and not result.head_results:
+            logger.warning("No benchmark data collected. Check that --script-output points to a valid JSON file.")
+            sys.exit(1)
 
+        if args.output:
+            md = result.format_markdown()
+            Path(args.output).write_text(md, encoding="utf-8")
+            logger.info(f"Markdown report written to {args.output}")
+        return
+
+    # Standard trace-benchmark mode: requires codeflash config
+    from codeflash.code_utils.config_parser import parse_config_file
+
+    pyproject_config, pyproject_file_path = parse_config_file(args.config_file)
     module_root = Path(pyproject_config.get("module_root", ".")).resolve()
+
+    from codeflash.cli_cmds.cli import project_root_from_module_root
+
+    project_root = project_root_from_module_root(module_root, pyproject_file_path)
     tests_root = Path(pyproject_config.get("tests_root", "tests")).resolve()
     benchmarks_root_str = pyproject_config.get("benchmarks_root")
 
@@ -34,42 +92,89 @@ def run_compare(args: Namespace) -> None:
         logger.error(f"benchmarks-root {benchmarks_root} is not a valid directory")
         sys.exit(1)
 
-    from codeflash.cli_cmds.cli import project_root_from_module_root
-
-    project_root = project_root_from_module_root(module_root, pyproject_file_path)
-
-    # Resolve head_ref
-    head_ref = args.head_ref
-    if args.pr:
-        head_ref = _resolve_pr_branch(args.pr)
-    if not head_ref:
-        logger.error("Must provide head_ref or --pr")
-        sys.exit(1)
-
     # Parse explicit functions if provided
     functions = None
     if args.functions:
-        functions = _parse_functions_arg(args.functions, project_root)
+        functions = parse_functions_arg(args.functions, project_root)
 
     from codeflash.benchmarking.compare import compare_branches
 
     result = compare_branches(
-        base_ref=args.base_ref,
+        base_ref=base_ref,
         head_ref=head_ref,
         project_root=project_root,
         benchmarks_root=benchmarks_root,
         tests_root=tests_root,
         functions=functions,
         timeout=args.timeout,
+        memory=getattr(args, "memory", False),
     )
 
-    if not result.base_total_ns and not result.head_total_ns:
+    if not result.base_stats and not result.head_stats:
         logger.warning("No benchmark data collected. Check that benchmarks-root is configured and benchmarks exist.")
         sys.exit(1)
 
+    if args.output:
+        md = result.format_markdown()
+        Path(args.output).write_text(md, encoding="utf-8")
+        logger.info(f"Markdown report written to {args.output}")
+
+
+def get_current_branch() -> str | None:
+    """Return the checked-out branch name, or None when detached or git fails."""
+    cmd = ["git", "rev-parse", "--abbrev-ref", "HEAD"]
+    try:
+        branch = subprocess.run(cmd, capture_output=True, text=True, check=True).stdout.strip()
+    except (FileNotFoundError, subprocess.CalledProcessError):
+        return None
+    # ``HEAD`` is what git prints for a detached checkout.
+    return None if branch in ("", "HEAD") else branch
+
+
+def detect_base_ref(head_ref: str) -> str | None:
+    """Guess the base ref to compare ``head_ref`` against.
+
+    Strategies are tried in order and the first non-empty answer wins:
+
+    1. the base branch of the PR that ``gh pr view`` resolves for ``head_ref``;
+    2. the repository's default branch as reported by ``gh repo view``;
+    3. a local ``main`` or ``master`` ref that ``git rev-parse --verify`` accepts.
+
+    Returns None when every strategy fails, which includes ``gh`` or ``git`` not being
+    installed.
+    """
+    gh_queries = (
+        # Base branch of the PR associated with head_ref.
+        ["gh", "pr", "view", head_ref, "--json", "baseRefName", "-q", ".baseRefName"],
+        # Default branch of the repository.
+        ["gh", "repo", "view", "--json", "defaultBranchRef", "-q", ".defaultBranchRef.name"],
+    )
+    for query in gh_queries:
+        try:
+            completed = subprocess.run(query, capture_output=True, text=True, check=True)
+        except (FileNotFoundError, subprocess.CalledProcessError):
+            # gh is missing or the lookup failed; move on to the next strategy.
+            continue
+        answer = completed.stdout.strip()
+        if answer:
+            return answer
+
+    # Last resort: check for common default branch names
+    for candidate in ("main", "master"):
+        try:
+            probe = subprocess.run(
+                ["git", "rev-parse", "--verify", candidate], capture_output=True, text=True, check=False
+            )
+        except FileNotFoundError:
+            # git itself is unavailable, so probing further candidates is pointless.
+            return None
+        if probe.returncode == 0:
+            return candidate
+
+    return None
+
 
-def _resolve_pr_branch(pr_number: int) -> str:
-    """Resolve a PR number to its head branch name using gh CLI."""
+def resolve_pr_branch(pr_number: int) -> str:
     try:
         result = subprocess.run(
             ["gh", "pr", "view", str(pr_number), "--json", "headRefName", "-q", ".headRefName"],
@@ -91,7 +196,7 @@ def _resolve_pr_branch(pr_number: int) -> str:
         sys.exit(1)
 
 
-def _parse_functions_arg(functions_str: str, project_root: Path) -> dict[Path, list[FunctionToOptimize]]:
+def parse_functions_arg(functions_str: str, project_root: Path) -> dict[Path, list[FunctionToOptimize]]:
     """Parse --functions arg format: 'file.py::func1,func2;other.py::func3'."""
     from codeflash.models.function_types import FunctionToOptimize
 
 
@@ -127,7 +127,8 @@ def run_benchmarks(
                     function_benchmark_timings = CodeFlashBenchmarkPlugin.get_function_benchmark_timings(
                         self.trace_file
                     )
-                    total_benchmark_timings = CodeFlashBenchmarkPlugin.get_benchmark_timings(self.trace_file)
+                    total_benchmark_stats = CodeFlashBenchmarkPlugin.get_benchmark_timings(self.trace_file)
+                    total_benchmark_timings = {k: v.median_ns for k, v in total_benchmark_stats.items()}
                     function_to_results = validate_and_format_benchmark_table(
                         function_benchmark_timings, total_benchmark_timings
                     )
 
@@ -53,6 +53,8 @@ dependencies = [
     "filelock>=3.20.3; python_version >= '3.10'",
     "filelock<3.20.3; python_version < '3.10'",
     "pytest-asyncio>=0.18.0",
+    "memray>=1.12; sys_platform != 'win32'",
+    "pytest-memray>=1.7; sys_platform != 'win32'",
 ]
 
 [project.urls]
@@ -339,8 +341,8 @@ vcs = "git"
 
 [tool.hatch.build.hooks.version]
 path = "codeflash/version.py"
-template = """# These version placeholders will be replaced by uv-dynamic-versioning during build.
-__version__ = "{version}"
+template = """# These version placeholders will be replaced by uv-dynamic-versioning during build.
+__version__ = "{version}"
 """
Original file line number	Diff line number	Diff line change
`@@ -127,7 +127,8 @@ def run_benchmarks(`
`127`	`127`	`function_benchmark_timings = CodeFlashBenchmarkPlugin.get_function_benchmark_timings(`
`128`	`128`	`self.trace_file`
`129`	`129`	`)`
`130`		`- total_benchmark_timings = CodeFlashBenchmarkPlugin.get_benchmark_timings(self.trace_file)`
	`130`	`+ total_benchmark_stats = CodeFlashBenchmarkPlugin.get_benchmark_timings(self.trace_file)`
	`131`	`+ total_benchmark_timings = {k: v.median_ns for k, v in total_benchmark_stats.items()}`
`131`	`132`	`function_to_results = validate_and_format_benchmark_table(`
`132`	`133`	`function_benchmark_timings, total_benchmark_timings`
`133`	`134`	`)`