Skip to content

Commit 5c1cc1e

Browse files
committed
perf: prefilter files by path before read_text() in discovery
Moves path-based checks (test file detection, ignore paths, submodule paths, site-packages, outside module-root) to run BEFORE read_text() is called in get_all_files_and_functions() and get_functions_within_lines(). This avoids unnecessary file I/O for files that would be discarded by filter_functions() anyway.

Also fixes a pre-existing mypy error in _find_all_functions_via_language_support, where discover_functions was called with the wrong arguments (the source text is now read and passed as the first argument).

Signature changes (backward-compatible, all new params are optional; prefiltering only runs when both tests_root and module_root are provided):
- get_all_files_and_functions: added tests_root, module_root params
- get_functions_within_lines: added tests_root, ignore_paths, module_root params
- get_functions_within_git_diff: added tests_root, ignore_paths, module_root params
1 parent 0a2ec48 commit 5c1cc1e

2 files changed

Lines changed: 220 additions & 7 deletions

File tree

codeflash/discovery/functions_to_optimize.py

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,8 @@ def _find_all_functions_via_language_support(file_path: Path) -> dict[Path, list
197197
lang_support = get_language_support(file_path)
198198
require_return = lang_support.language != Language.JAVA
199199
criteria = FunctionFilterCriteria(require_return=require_return)
200-
functions[file_path] = lang_support.discover_functions(file_path, criteria)
200+
source = file_path.read_text(encoding="utf-8")
201+
functions[file_path] = lang_support.discover_functions(source, file_path, criteria)
201202
except Exception as e:
202203
logger.debug(f"Failed to discover functions in {file_path}: {e}")
203204

@@ -226,7 +227,9 @@ def get_functions_to_optimize(
226227
if optimize_all:
227228
logger.info("!lsp|Finding all functions in the module '%s'…", optimize_all)
228229
console.rule()
229-
functions = get_all_files_and_functions(Path(optimize_all), ignore_paths)
230+
functions = get_all_files_and_functions(
231+
Path(optimize_all), ignore_paths, tests_root=test_cfg.tests_root, module_root=module_root
232+
)
230233
elif replay_test:
231234
functions, trace_file_path = get_all_replay_test_functions(
232235
replay_test=replay_test, test_cfg=test_cfg, project_root_path=project_root
@@ -317,7 +320,12 @@ def get_functions_to_optimize(
317320
logger.info("Finding all functions modified in the current git diff ...")
318321
console.rule()
319322
ph("cli-optimizing-git-diff")
320-
functions = get_functions_within_git_diff(uncommitted_changes=False)
323+
functions = get_functions_within_git_diff(
324+
uncommitted_changes=False,
325+
tests_root=test_cfg.tests_root,
326+
ignore_paths=ignore_paths,
327+
module_root=module_root,
328+
)
321329
filtered_modified_functions, functions_count = filter_functions(
322330
functions, test_cfg.tests_root, ignore_paths, project_root, module_root, previous_checkpoint_functions
323331
)
@@ -326,9 +334,16 @@ def get_functions_to_optimize(
326334
return filtered_modified_functions, functions_count, trace_file_path
327335

328336

329-
def get_functions_within_git_diff(uncommitted_changes: bool) -> dict[Path, list[FunctionToOptimize]]:
337+
def get_functions_within_git_diff(
338+
uncommitted_changes: bool,
339+
tests_root: Path | None = None,
340+
ignore_paths: list[Path] | None = None,
341+
module_root: Path | None = None,
342+
) -> dict[Path, list[FunctionToOptimize]]:
330343
modified_lines: dict[str, list[int]] = get_git_diff(uncommitted_changes=uncommitted_changes)
331-
return get_functions_within_lines(modified_lines)
344+
return get_functions_within_lines(
345+
modified_lines, tests_root=tests_root, ignore_paths=ignore_paths, module_root=module_root
346+
)
332347

333348

334349
def closest_matching_file_function_name(
@@ -406,12 +421,20 @@ def get_functions_inside_a_commit(commit_hash: str) -> dict[Path, list[FunctionT
406421
return get_functions_within_lines(modified_lines)
407422

408423

409-
def get_functions_within_lines(modified_lines: dict[str, list[int]]) -> dict[Path, list[FunctionToOptimize]]:
424+
def get_functions_within_lines(
425+
modified_lines: dict[str, list[int]],
426+
tests_root: Path | None = None,
427+
ignore_paths: list[Path] | None = None,
428+
module_root: Path | None = None,
429+
) -> dict[Path, list[FunctionToOptimize]]:
410430
functions: dict[Path, list[FunctionToOptimize]] = {}
411431
for path_str, lines_in_file in modified_lines.items():
412432
path = Path(path_str)
413433
if not path.exists():
414434
continue
435+
if tests_root is not None and module_root is not None:
436+
if not filter_files_optimized(path, tests_root, ignore_paths or [], module_root):
437+
continue
415438
all_functions = find_all_functions_in_file(path)
416439
functions[path] = [
417440
func
@@ -424,21 +447,30 @@ def get_functions_within_lines(modified_lines: dict[str, list[int]]) -> dict[Pat
424447

425448

426449
def get_all_files_and_functions(
427-
module_root_path: Path, ignore_paths: list[Path], language: Language | None = None
450+
module_root_path: Path,
451+
ignore_paths: list[Path],
452+
language: Language | None = None,
453+
tests_root: Path | None = None,
454+
module_root: Path | None = None,
428455
) -> dict[Path, list[FunctionToOptimize]]:
429456
"""Get all optimizable functions from files in the module root.
430457
431458
Args:
432459
module_root_path: Root path to search for source files.
433460
ignore_paths: List of paths to ignore.
434461
language: Optional specific language to filter for. If None, includes all supported languages.
462+
tests_root: Test root path for prefiltering files before reading (avoids unnecessary I/O).
463+
module_root: Module root path for prefiltering files before reading.
435464
436465
Returns:
437466
Dictionary mapping file paths to lists of FunctionToOptimize.
438467
439468
"""
440469
functions: dict[Path, list[FunctionToOptimize]] = {}
441470
for file_path in get_files_for_language(module_root_path, ignore_paths, language):
471+
if tests_root is not None and module_root is not None:
472+
if not filter_files_optimized(file_path, tests_root, ignore_paths, module_root):
473+
continue
442474
functions.update(find_all_functions_in_file(file_path).items())
443475
# Randomize the order of the files to optimize to avoid optimizing the same file in the same order every time.
444476
# Helpful if an optimize-all run is stuck and we restart it.

tests/test_discovery_prefilter.py

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
from __future__ import annotations
2+
3+
from pathlib import Path
4+
from unittest.mock import patch
5+
6+
from codeflash.discovery.functions_to_optimize import (
7+
get_all_files_and_functions,
8+
get_functions_within_lines,
9+
)
10+
11+
12+
def test_prefilter_skips_test_files(tmp_path: Path) -> None:
13+
"""Files in tests_root should be skipped before read_text() is called."""
14+
module_root = tmp_path / "src"
15+
module_root.mkdir()
16+
tests_root = tmp_path / "tests"
17+
tests_root.mkdir()
18+
19+
source_file = module_root / "app.py"
20+
source_file.write_text("def compute():\n return 1\n", encoding="utf-8")
21+
22+
test_file = tests_root / "test_app.py"
23+
test_file.write_text("def test_compute():\n return True\n", encoding="utf-8")
24+
25+
with patch("codeflash.discovery.functions_to_optimize.get_files_for_language") as mock_get_files:
26+
mock_get_files.return_value = [source_file, test_file]
27+
result = get_all_files_and_functions(
28+
module_root, ignore_paths=[], tests_root=tests_root, module_root=module_root
29+
)
30+
31+
assert source_file in result
32+
assert test_file not in result
33+
34+
35+
def test_prefilter_skips_ignored_paths(tmp_path: Path) -> None:
36+
"""Files in ignore_paths should be skipped before read_text() is called."""
37+
module_root = tmp_path / "src"
38+
module_root.mkdir()
39+
tests_root = tmp_path / "tests"
40+
tests_root.mkdir()
41+
ignored_dir = module_root / "vendor"
42+
ignored_dir.mkdir()
43+
44+
source_file = module_root / "app.py"
45+
source_file.write_text("def compute():\n return 1\n", encoding="utf-8")
46+
47+
vendor_file = ignored_dir / "lib.py"
48+
vendor_file.write_text("def helper():\n return 2\n", encoding="utf-8")
49+
50+
with patch("codeflash.discovery.functions_to_optimize.get_files_for_language") as mock_get_files:
51+
mock_get_files.return_value = [source_file, vendor_file]
52+
result = get_all_files_and_functions(
53+
module_root, ignore_paths=[ignored_dir], tests_root=tests_root, module_root=module_root
54+
)
55+
56+
assert source_file in result
57+
assert vendor_file not in result
58+
59+
60+
def test_prefilter_skips_files_outside_module_root(tmp_path: Path) -> None:
61+
"""Files outside module_root should be skipped before read_text() is called."""
62+
module_root = tmp_path / "src"
63+
module_root.mkdir()
64+
tests_root = tmp_path / "tests"
65+
tests_root.mkdir()
66+
other_dir = tmp_path / "other"
67+
other_dir.mkdir()
68+
69+
source_file = module_root / "app.py"
70+
source_file.write_text("def compute():\n return 1\n", encoding="utf-8")
71+
72+
outside_file = other_dir / "stray.py"
73+
outside_file.write_text("def stray():\n return 3\n", encoding="utf-8")
74+
75+
with patch("codeflash.discovery.functions_to_optimize.get_files_for_language") as mock_get_files:
76+
mock_get_files.return_value = [source_file, outside_file]
77+
result = get_all_files_and_functions(
78+
module_root, ignore_paths=[], tests_root=tests_root, module_root=module_root
79+
)
80+
81+
assert source_file in result
82+
assert outside_file not in result
83+
84+
85+
def test_prefilter_disabled_without_params(tmp_path: Path) -> None:
86+
"""Without tests_root/module_root, no prefiltering occurs (backward compat)."""
87+
module_root = tmp_path / "src"
88+
module_root.mkdir()
89+
90+
source_file = module_root / "app.py"
91+
source_file.write_text("def compute():\n return 1\n", encoding="utf-8")
92+
93+
with patch("codeflash.discovery.functions_to_optimize.get_files_for_language") as mock_get_files:
94+
mock_get_files.return_value = [source_file]
95+
result = get_all_files_and_functions(module_root, ignore_paths=[])
96+
97+
assert source_file in result
98+
99+
100+
def test_prefilter_in_get_functions_within_lines(tmp_path: Path) -> None:
101+
"""get_functions_within_lines should skip test files when prefilter params are provided."""
102+
module_root = tmp_path / "src"
103+
module_root.mkdir()
104+
tests_root = tmp_path / "tests"
105+
tests_root.mkdir()
106+
107+
source_file = module_root / "app.py"
108+
source_file.write_text("def compute():\n return 1\n", encoding="utf-8")
109+
110+
test_file = tests_root / "test_app.py"
111+
test_file.write_text("def test_compute():\n return True\n", encoding="utf-8")
112+
113+
modified_lines = {
114+
str(source_file): [1, 2],
115+
str(test_file): [1, 2],
116+
}
117+
118+
result = get_functions_within_lines(
119+
modified_lines, tests_root=tests_root, ignore_paths=[], module_root=module_root
120+
)
121+
122+
assert source_file in result
123+
assert test_file not in result
124+
125+
126+
def test_prefilter_avoids_reading_skipped_files(tmp_path: Path) -> None:
127+
"""Verify that find_all_functions_in_file is NOT called for prefiltered files (the core perf win)."""
128+
module_root = tmp_path / "src"
129+
module_root.mkdir()
130+
tests_root = tmp_path / "tests"
131+
tests_root.mkdir()
132+
133+
source_file = module_root / "app.py"
134+
source_file.write_text("def compute():\n return 1\n", encoding="utf-8")
135+
136+
test_file = tests_root / "test_app.py"
137+
test_file.write_text("def test_compute():\n return True\n", encoding="utf-8")
138+
139+
with (
140+
patch("codeflash.discovery.functions_to_optimize.get_files_for_language") as mock_get_files,
141+
patch("codeflash.discovery.functions_to_optimize.find_all_functions_in_file") as mock_find,
142+
):
143+
mock_get_files.return_value = [source_file, test_file]
144+
mock_find.return_value = {}
145+
get_all_files_and_functions(
146+
module_root, ignore_paths=[], tests_root=tests_root, module_root=module_root
147+
)
148+
149+
# find_all_functions_in_file (which does read_text) should only be called for source_file
150+
assert mock_find.call_count == 1
151+
mock_find.assert_called_once_with(source_file)
152+
153+
154+
def test_prefilter_skips_submodule_paths(tmp_path: Path) -> None:
155+
"""Submodule paths should be skipped by prefilter."""
156+
module_root = tmp_path / "src"
157+
module_root.mkdir()
158+
tests_root = tmp_path / "tests"
159+
tests_root.mkdir()
160+
submodule_dir = module_root / "vendor_submodule"
161+
submodule_dir.mkdir()
162+
163+
source_file = module_root / "app.py"
164+
source_file.write_text("def compute():\n return 1\n", encoding="utf-8")
165+
166+
submodule_file = submodule_dir / "lib.py"
167+
submodule_file.write_text("def helper():\n return 2\n", encoding="utf-8")
168+
169+
with (
170+
patch("codeflash.discovery.functions_to_optimize.get_files_for_language") as mock_get_files,
171+
patch(
172+
"codeflash.discovery.functions_to_optimize.ignored_submodule_paths", return_value=[submodule_dir]
173+
),
174+
):
175+
mock_get_files.return_value = [source_file, submodule_file]
176+
result = get_all_files_and_functions(
177+
module_root, ignore_paths=[], tests_root=tests_root, module_root=module_root
178+
)
179+
180+
assert source_file in result
181+
assert submodule_file not in result

0 commit comments

Comments
 (0)