feat: add --memory flag to codeflash compare for peak memory profiling

KRRT7 · KRRT7 · commit 279a8fcb29d9 · 2026-04-02T10:29:31.000-05:00
Adds a second profiling phase using pytest-memray that runs after timing
benchmarks. Memory tables are suppressed when the delta is <1%.
diff --git a/codeflash/benchmarking/compare.py b/codeflash/benchmarking/compare.py
@@ -25,7 +25,7 @@
 if TYPE_CHECKING:
     from collections.abc import Callable
 
-    from codeflash.benchmarking.plugin.plugin import BenchmarkStats
+    from codeflash.benchmarking.plugin.plugin import BenchmarkStats, MemoryStats
     from codeflash.models.function_types import FunctionToOptimize
     from codeflash.models.models import BenchmarkKey
 
@@ -42,6 +42,8 @@ class CompareResult:
     head_stats: dict[BenchmarkKey, BenchmarkStats] = field(default_factory=dict)
     base_function_ns: dict[str, dict[BenchmarkKey, float]] = field(default_factory=dict)
     head_function_ns: dict[str, dict[BenchmarkKey, float]] = field(default_factory=dict)
+    base_memory: dict[BenchmarkKey, MemoryStats] = field(default_factory=dict)
+    head_memory: dict[BenchmarkKey, MemoryStats] = field(default_factory=dict)
 
     def format_markdown(self) -> str:
         if not self.base_stats and not self.head_stats:
@@ -106,6 +108,29 @@ def sort_key(fn: str, _bm_key: BenchmarkKey = bm_key) -> float:
                         f"| `{short_name}` | {fmt_us(b)} | {fmt_us(h)} | {md_bar(b, h)} | {md_speedup(b, h)} |"
                     )
 
+            # Memory section (skip when delta is negligible)
+            base_mem = self.base_memory.get(bm_key)
+            head_mem = self.head_memory.get(bm_key)
+            if has_meaningful_memory_change(base_mem, head_mem):
+                lines.append("")
+                lines.append("#### Memory")
+                lines.append("")
+                lines.append("| Ref | Peak Memory | Allocations | Delta |")
+                lines.append("|:---|---:|---:|:---|")
+                if base_mem:
+                    lines.append(
+                        f"| `{base_short}` (base) | {md_bytes(base_mem.peak_memory_bytes)}"
+                        f" | {base_mem.total_allocations:,} | |"
+                    )
+                if head_mem:
+                    delta = md_memory_delta(
+                        base_mem.peak_memory_bytes if base_mem else None, head_mem.peak_memory_bytes
+                    )
+                    lines.append(
+                        f"| `{head_short}` (head) | {md_bytes(head_mem.peak_memory_bytes)}"
+                        f" | {head_mem.total_allocations:,} | {delta} |"
+                    )
+
             sections.append("\n".join(lines))
 
         sections.append("---\n*Generated by codeflash optimization agent*")
@@ -120,16 +145,23 @@ def compare_branches(
     tests_root: Path,
     functions: Optional[dict[Path, list[FunctionToOptimize]]] = None,
     timeout: int = 600,
+    memory: bool = False,
 ) -> CompareResult:
     """Compare benchmark performance between two git refs.
 
     If functions is None, auto-detects changed functions from git diff.
     Returns a CompareResult with timing data from both refs.
     """
+    import sys
+
     from codeflash.benchmarking.instrument_codeflash_trace import instrument_codeflash_trace_decorator
     from codeflash.benchmarking.plugin.plugin import CodeFlashBenchmarkPlugin
     from codeflash.benchmarking.trace_benchmarks import trace_benchmarks_pytest
 
+    if memory and sys.platform == "win32":
+        logger.error("--memory requires memray which is not available on Windows")
+        return CompareResult(base_ref=base_ref, head_ref=head_ref)
+
     repo = git.Repo(project_root, search_parent_directories=True)
     repo_root = Path(repo.working_dir)
 
@@ -182,12 +214,17 @@ def compare_branches(
     head_worktree = worktree_dirs / f"compare-head-{timestamp}"
     base_trace_db = worktree_dirs / f"trace-base-{timestamp}.db"
     head_trace_db = worktree_dirs / f"trace-head-{timestamp}.db"
+    base_memray_dir = worktree_dirs / f"memray-base-{timestamp}"
+    head_memray_dir = worktree_dirs / f"memray-head-{timestamp}"
+    memray_prefix = "cf-mem"
 
     result = CompareResult(base_ref=base_ref, head_ref=head_ref)
 
     from rich.console import Group
 
     step_labels = ["Creating worktrees", f"Benchmarking base ({base_short})", f"Benchmarking head ({head_short})"]
+    if memory:
+        step_labels.extend([f"Memory profiling base ({base_short})", f"Memory profiling head ({head_short})"])
 
     def build_steps(current_step: int) -> Group:
         lines: list[Text] = []
@@ -260,6 +297,18 @@ def build_panel(current_step: int) -> Panel:
                 trace_fn=trace_benchmarks_pytest,
             )
 
+            # Steps 4-5: Memory profiling (reuses existing worktrees)
+            if memory:
+                from codeflash.benchmarking.trace_benchmarks import memory_benchmarks_pytest
+
+                live.update(build_panel(3))
+                wt_base_benchmarks = base_worktree / benchmarks_root.relative_to(repo_root)
+                memory_benchmarks_pytest(wt_base_benchmarks, base_worktree, base_memray_dir, memray_prefix, timeout)
+
+                live.update(build_panel(4))
+                wt_head_benchmarks = head_worktree / benchmarks_root.relative_to(repo_root)
+                memory_benchmarks_pytest(wt_head_benchmarks, head_worktree, head_memray_dir, memray_prefix, timeout)
+
         # Load results
         if base_trace_db.exists():
             result.base_stats = CodeFlashBenchmarkPlugin.get_benchmark_timings(base_trace_db)
@@ -269,6 +318,14 @@ def build_panel(current_step: int) -> Panel:
             result.head_stats = CodeFlashBenchmarkPlugin.get_benchmark_timings(head_trace_db)
             result.head_function_ns = CodeFlashBenchmarkPlugin.get_function_benchmark_timings(head_trace_db)
 
+        if memory:
+            from codeflash.benchmarking.plugin.plugin import MemoryStats
+
+            if base_memray_dir.exists():
+                result.base_memory = MemoryStats.parse_memray_results(base_memray_dir, memray_prefix)
+            if head_memray_dir.exists():
+                result.head_memory = MemoryStats.parse_memray_results(head_memray_dir, memray_prefix)
+
         # Render comparison
         render_comparison(result)
 
@@ -282,10 +339,16 @@ def build_panel(current_step: int) -> Panel:
         remove_worktree(base_worktree)
         remove_worktree(head_worktree)
         repo.git.worktree("prune")
-        # Cleanup trace DBs
+        # Cleanup trace DBs and memray dirs
         for db in [base_trace_db, head_trace_db]:
             if db.exists():
                 db.unlink()
+        if memory:
+            import shutil
+
+            for memray_dir in [base_memray_dir, head_memray_dir]:
+                if memray_dir.exists():
+                    shutil.rmtree(memray_dir)
 
     return result
 
@@ -543,6 +606,31 @@ def sort_key(fn: str, _bm_key: BenchmarkKey = bm_key) -> float:
 
             console.print(t2, justify="center")
 
+        # Table 3: Memory (skip when delta is negligible)
+        base_mem = result.base_memory.get(bm_key)
+        head_mem = result.head_memory.get(bm_key)
+        if has_meaningful_memory_change(base_mem, head_mem):
+            console.print()
+            t3 = Table(title="Memory (peak per test)", border_style="magenta", show_lines=True, expand=False)
+            t3.add_column("Ref", style="bold cyan")
+            t3.add_column("Peak Memory", justify="right")
+            t3.add_column("Allocations", justify="right")
+            t3.add_column("Delta", justify="right")
+
+            if base_mem:
+                t3.add_row(
+                    f"{base_short} (base)", fmt_bytes(base_mem.peak_memory_bytes), f"{base_mem.total_allocations:,}", ""
+                )
+            if head_mem:
+                delta = fmt_memory_delta(base_mem.peak_memory_bytes if base_mem else None, head_mem.peak_memory_bytes)
+                t3.add_row(
+                    f"{head_short} (head)",
+                    fmt_bytes(head_mem.peak_memory_bytes),
+                    f"{head_mem.total_allocations:,}",
+                    delta,
+                )
+            console.print(t3, justify="center")
+
     console.print()
 
 
@@ -641,3 +729,63 @@ def md_bar(before: Optional[float], after: Optional[float], width: int = 10) ->
     filled = min(filled, width)
     bar = "\u2588" * filled + "\u2591" * (width - filled)
     return f"`{bar}` {pct:+.0f}%"
+
+
+def fmt_bytes(b: Optional[int]) -> str:
+    """Format a byte count using binary (1024-based) units.
+
+    Returns "-" when *b* is None. Values below 1 KiB are shown as a
+    comma-grouped integer followed by " B"; larger values are shown with one
+    decimal place in the largest of KiB / MiB / GiB that *b* reaches.
+    """
+    if b is None:
+        return "-"
+    # Largest unit first, so the first threshold that *b* reaches wins.
+    for shift, unit in ((30, "GiB"), (20, "MiB"), (10, "KiB")):
+        scale = 1 << shift
+        if b >= scale:
+            return f"{b / scale:,.1f} {unit}"
+    return f"{b:,} B"
+
+
+def fmt_memory_delta(before: Optional[int], after: Optional[int]) -> str:
+    """Format the relative change in peak memory from *before* to *after*.
+
+    Returns "-" when either value is missing or the baseline is zero, since the
+    percentage is undefined in those cases. Otherwise the percentage change is
+    interpolated into ``_GREEN_TPL`` when memory did not grow and into
+    ``_RED_TPL`` when it did.
+    """
+    if before is None or after is None or before == 0:
+        return "-"
+    pct = ((after - before) / before) * 100
+    # An unchanged peak is not a regression, so 0% takes the green template,
+    # matching the `pct <= 0` rule used by md_memory_delta.
+    if pct <= 0:
+        return _GREEN_TPL % pct
+    return _RED_TPL % pct
+
+
+def md_bytes(b: Optional[int]) -> str:
+    """Format a byte count for the markdown report.
+
+    The markdown and console renderings of a byte count are identical, so this
+    delegates to fmt_bytes rather than keeping a second copy of the unit
+    thresholds. Returns "-" when *b* is None.
+    """
+    return fmt_bytes(b)
+
+
+def md_memory_delta(before: Optional[int], after: Optional[int]) -> str:
+    """Render the peak-memory change from *before* to *after* as a markdown cell.
+
+    Returns "-" when either value is missing or the baseline is zero. Otherwise
+    returns a coloured circle emoji (green when memory did not grow, red when
+    it did) followed by the signed percentage rounded to a whole number.
+    """
+    missing = before is None or after is None
+    if missing or before == 0:
+        return "-"
+    change_pct = (after - before) / before * 100
+    marker = "\U0001f534" if change_pct > 0 else "\U0001f7e2"
+    return f"{marker} {change_pct:+.0f}%"
+
+
+def has_meaningful_memory_change(
+    base_mem: Optional[MemoryStats], head_mem: Optional[MemoryStats], threshold_pct: float = 1.0
+) -> bool:
+    """Return True if peak memory or allocation count changed by more than threshold_pct.
+
+    When only one side has data the change is always reported (True); when
+    neither side has data there is nothing to show (False). A metric that goes
+    from zero to a non-zero value counts as a meaningful change, because its
+    relative change cannot be expressed as a percentage of the baseline.
+    """
+    if base_mem is None or head_mem is None:
+        return base_mem is not None or head_mem is not None
+    return _metric_changed(
+        base_mem.peak_memory_bytes, head_mem.peak_memory_bytes, threshold_pct
+    ) or _metric_changed(base_mem.total_allocations, head_mem.total_allocations, threshold_pct)
+
+
+def _metric_changed(before: int, after: int, threshold_pct: float) -> bool:
+    """Return True if *after* differs from *before* by more than threshold_pct percent."""
+    if before == 0:
+        # No baseline to divide by: any non-zero value is a change.
+        return after != 0
+    return abs((after - before) / before) * 100 > threshold_pct
diff --git a/codeflash/benchmarking/plugin/plugin.py b/codeflash/benchmarking/plugin/plugin.py
@@ -68,6 +68,51 @@ def from_per_iteration_times(times_ns: list[float], iterations: int) -> Benchmar
         )
 
 
+@dataclass
+class MemoryStats:
+    """Peak-memory summary for one benchmark, read from a memray capture file."""
+
+    # Both values are copied from the capture's metadata
+    # (`peak_memory` and `total_allocations` respectively).
+    peak_memory_bytes: int
+    total_allocations: int
+
+    @staticmethod
+    def parse_memray_results(bin_dir: Path, bin_prefix: str) -> dict:
+        """Collect memory stats from every ``{bin_prefix}-*.bin`` file in *bin_dir*.
+
+        Returns a dict mapping BenchmarkKey to MemoryStats. The key is derived
+        from the file name: the part before ".py-" becomes the dotted module
+        path and the part after it becomes the function name. Files that
+        cannot be read (OSError) are skipped.
+
+        Raises:
+            ImportError: if memray is not installed.
+        """
+        from codeflash.models.models import BenchmarkKey
+
+        try:
+            from memray import FileReader
+        except ImportError as e:
+            msg = "memray is required for --memory profiling. Install with: uv add memray pytest-memray"
+            raise ImportError(msg) from e
+
+        results: dict[BenchmarkKey, MemoryStats] = {}
+        for bin_file in sorted(bin_dir.glob(f"{bin_prefix}-*.bin")):
+            # File names are expected to look like
+            #   {prefix}-tests-benchmarks-test_file.py-test_func_name.bin
+            # i.e. the pytest node id with "::" and path separators replaced by "-".
+            nodeid_part = bin_file.stem[len(bin_prefix) + 1 :]  # strip "{prefix}-"
+            module_raw, py_sep, function_name = nodeid_part.partition(".py-")
+            if py_sep:
+                module_part = module_raw.replace("-", ".")
+            else:
+                # No ".py-" marker: treat the text after the last "-" as the
+                # function name; with no "-" at all, use the whole name for both.
+                head, dash, tail = nodeid_part.rpartition("-")
+                if dash:
+                    module_part, function_name = head.replace("-", "."), tail
+                else:
+                    module_part = function_name = nodeid_part
+
+            try:
+                reader = FileReader(str(bin_file))
+                try:
+                    meta = reader.metadata
+                    bm_key = BenchmarkKey(module_path=module_part, function_name=function_name)
+                    results[bm_key] = MemoryStats(
+                        peak_memory_bytes=meta.peak_memory, total_allocations=meta.total_allocations
+                    )
+                finally:
+                    # Always release the capture file, even if reading its metadata fails.
+                    reader.close()
+            except OSError:
+                continue
+        return results
+
+
 class CodeFlashBenchmarkPlugin:
     def __init__(self) -> None:
         self._trace_path = None
diff --git a/codeflash/benchmarking/pytest_new_process_memory_benchmarks.py b/codeflash/benchmarking/pytest_new_process_memory_benchmarks.py
@@ -0,0 +1,42 @@
+"""Subprocess entry point for memory profiling benchmarks via pytest-memray.
+
+Runs pytest with --memray --native to profile peak memory per test function.
+The codeflash-benchmark plugin is left active (without --codeflash-trace) so it
+provides a no-op ``benchmark`` fixture for tests that depend on it.
+"""
+
+import sys
+from pathlib import Path
+
+if __name__ == "__main__":
+    import pytest
+
+    # Arguments are read only when run as a script, so importing this module has
+    # no side effects and a missing argument gives a usage message instead of a
+    # bare IndexError.
+    if len(sys.argv) < 4:
+        sys.exit(f"usage: {Path(sys.argv[0]).name} BENCHMARKS_ROOT MEMRAY_BIN_DIR MEMRAY_BIN_PREFIX")
+    benchmarks_root, memray_bin_dir, memray_bin_prefix = sys.argv[1:4]
+
+    # Create the output directory up front so the capture files have somewhere to go.
+    Path(memray_bin_dir).mkdir(parents=True, exist_ok=True)
+
+    exitcode = pytest.main(
+        [
+            benchmarks_root,
+            "--memray",
+            "--native",
+            f"--memray-bin-path={memray_bin_dir}",
+            f"--memray-bin-prefix={memray_bin_prefix}",
+            "--hide-memray-summary",
+            # Turn off other plugins by name for this run.
+            "-p",
+            "no:benchmark",
+            "-p",
+            "no:codspeed",
+            "-p",
+            "no:cov",
+            "-p",
+            "no:profiling",
+            "-s",
+            # Clear any addopts from the project's pytest configuration.
+            "-o",
+            "addopts=",
+        ]
+    )
+
+    sys.exit(exitcode)
diff --git a/codeflash/benchmarking/trace_benchmarks.py b/codeflash/benchmarking/trace_benchmarks.py
@@ -46,3 +46,39 @@ def trace_benchmarks_pytest(
             error_section = combined_output
         logger.warning(f"Error collecting benchmarks - Pytest Exit code: {result.returncode}, {error_section}")
         logger.debug(f"Full pytest output:\n{combined_output}")
+
+
+def memory_benchmarks_pytest(
+    benchmarks_root: Path, project_root: Path, memray_bin_dir: Path, memray_bin_prefix: str, timeout: int = 300
+) -> None:
+    """Run the memory-profiling pytest script in a subprocess.
+
+    The script is launched with *project_root* as its working directory and is
+    given *benchmarks_root*, *memray_bin_dir* and *memray_bin_prefix* as
+    arguments. A non-zero exit code is not raised; instead the most relevant
+    part of the output is logged as a warning and the full output at debug level.
+    """
+    env = make_env_with_project_root(project_root)
+    run_args = get_cross_platform_subprocess_run_args(
+        cwd=project_root, env=env, timeout=timeout, check=False, text=True, capture_output=True
+    )
+    script = Path(__file__).parent / "pytest_new_process_memory_benchmarks.py"
+    command = [SAFE_SYS_EXECUTABLE, script, benchmarks_root, memray_bin_dir, memray_bin_prefix]
+    result = subprocess.run(command, **run_args)  # noqa: PLW1510
+    if result.returncode == 0:
+        return
+
+    # Merge stdout and stderr, skipping whichever one is empty.
+    combined_output = result.stdout
+    if result.stderr:
+        combined_output = combined_output + "\n" + result.stderr if combined_output else result.stderr
+
+    # The first marker found decides which pytest report section to extract;
+    # if the section cannot be located the whole output is reported.
+    section_patterns = (
+        ("ERROR collecting", r"={3,}\s*ERRORS\s*={3,}\n([\s\S]*?)(?:={3,}|$)"),
+        ("FAILURES", r"={3,}\s*FAILURES\s*={3,}\n([\s\S]*?)(?:={3,}|$)"),
+    )
+    error_section = combined_output
+    for marker, pattern in section_patterns:
+        if marker in combined_output:
+            found = re.search(pattern, combined_output)
+            if found:
+                error_section = found.group(1)
+            break
+    logger.warning(f"Error collecting memory benchmarks - Pytest Exit code: {result.returncode}, {error_section}")
+    logger.debug(f"Full pytest output:\n{combined_output}")
diff --git a/codeflash/cli_cmds/cli.py b/codeflash/cli_cmds/cli.py
@@ -392,6 +392,9 @@ def _build_parser() -> ArgumentParser:
     )
     compare_parser.add_argument("--timeout", type=int, default=600, help="Benchmark timeout in seconds (default: 600)")
     compare_parser.add_argument("--output", "-o", type=str, help="Write markdown report to file")
+    compare_parser.add_argument(
+        "--memory", action="store_true", help="Profile peak memory usage per benchmark (requires memray, Linux/macOS)"
+    )
     compare_parser.add_argument("--config-file", type=str, dest="config_file", help="Path to pyproject.toml")
 
     trace_optimize = subparsers.add_parser("optimize", help="Trace and optimize your project.")
diff --git a/codeflash/cli_cmds/cmd_compare.py b/codeflash/cli_cmds/cmd_compare.py
@@ -73,6 +73,7 @@ def run_compare(args: Namespace) -> None:
         tests_root=tests_root,
         functions=functions,
         timeout=args.timeout,
+        memory=getattr(args, "memory", False),
     )
 
     if not result.base_stats and not result.head_stats:
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -392,6 +392,9 @@ def _build_parser() -> ArgumentParser:`
`392`	`392`	`)`
`393`	`393`	`compare_parser.add_argument("--timeout", type=int, default=600, help="Benchmark timeout in seconds (default: 600)")`
`394`	`394`	`compare_parser.add_argument("--output", "-o", type=str, help="Write markdown report to file")`
	`395`	`+ compare_parser.add_argument(`
	`396`	`+ "--memory", action="store_true", help="Profile peak memory usage per benchmark (requires memray, Linux/macOS)"`
	`397`	`+ )`
`395`	`398`	`compare_parser.add_argument("--config-file", type=str, dest="config_file", help="Path to pyproject.toml")`
`396`	`399`
`397`	`400`	`trace_optimize = subparsers.add_parser("optimize", help="Trace and optimize your project.")`
Original file line number	Diff line number	Diff line change
`@@ -73,6 +73,7 @@ def run_compare(args: Namespace) -> None:`
`73`	`73`	`tests_root=tests_root,`
`74`	`74`	`functions=functions,`
`75`	`75`	`timeout=args.timeout,`
	`76`	`+ memory=getattr(args, "memory", False),`
`76`	`77`	`)`
`77`	`78`
`78`	`79`	`if not result.base_stats and not result.head_stats:`