Skip to content

Commit 2f42428

Browse files
cdeust and claude authored
fix(codebase_analyze): default to no file cap, treat max_files<=0 as unbounded (#25)
The previous default of max_files=500 — and the cortex-setup-project skill's explicit max_files=5000 — silently truncated real codebases at exactly the cap. Two of the user's repos (~ai-architect-prd-builder, ai-prd) hit 5000 exactly during a full-scale bootstrap, meaning files were dropped from the knowledge graph. Change the contract so max_files<=0 means "no limit". Split the helper into _collect_bounded (preserves ADR-0045 §R2 streaming for callers who opt into a cap) and _collect_unbounded (walks the whole tree but only materialises post-filter survivors — memory is O(filtered_files), not O(tree_size)). Default now 0 across handler, schema, and tool registry. Add 3 regression tests: 7500-file unbounded walk, negative-as-unbounded, language/IGNORE_DIRS filtering still applied in unbounded mode, plus the existing 7 bounded-mode tests unchanged. Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 351f06c commit 2f42428

4 files changed

Lines changed: 109 additions & 38 deletions

File tree

mcp_server/handlers/codebase_analyze.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -93,11 +93,10 @@
9393
},
9494
"max_files": {
9595
"type": "integer",
96-
"description": "Maximum number of files to process per call. Cap to avoid runaway analysis on monorepos.",
97-
"default": 500,
98-
"minimum": 1,
99-
"maximum": 50000,
100-
"examples": [100, 500, 5000],
96+
"description": "Maximum number of files to process per call. Set to 0 (default) for no limit — process every matching file. Use a positive cap only to bound runaway analysis on extremely large monorepos.",
97+
"default": 0,
98+
"minimum": 0,
99+
"examples": [0, 500, 5000],
101100
},
102101
"max_file_size_kb": {
103102
"type": "integer",
@@ -129,7 +128,8 @@
129128
CODEBASE_SOURCE = "codebase_analyze"
130129
CODEBASE_TAG = "codebase"
131130
LANG_TAG_PREFIX = "lang:"
132-
DEFAULT_MAX_FILES = 500
131+
# 0 = no limit. Positive values cap the walk; see helpers.collect_source_files.
132+
DEFAULT_MAX_FILES = 0
133133
DEFAULT_MAX_FILE_SIZE_KB = 100
134134

135135
_store: MemoryStore | None = None

mcp_server/handlers/codebase_analyze_helpers.py

Lines changed: 65 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -38,48 +38,82 @@ def collect_source_files(
3838
3939
Preconditions:
4040
- ``root`` is an existing directory.
41-
- ``max_files > 0`` and ``max_bytes > 0``.
41+
- ``max_bytes > 0``.
42+
- ``max_files`` may be any integer; ``<= 0`` means "no limit" and
43+
processes every matching file in the tree.
4244
4345
Postconditions:
44-
- Returns at most ``max_files`` paths, each referring to a regular file
45-
whose extension maps to a known language (and satisfies ``languages``
46-
if supplied), and whose size is ``<= max_bytes``.
47-
- Peak memory footprint is O(max_files * CANDIDATE_MULTIPLIER) paths,
48-
not O(tree_size) — see ADR-0045 §R2. On a 10M-file monorepo with
49-
``max_files=5000`` we hold at most 50K Path objects during the sort,
50-
not 10M.
51-
52-
Invariant (per iteration): ``len(files) <= max_files``.
46+
- When ``max_files > 0``: returns at most ``max_files`` paths,
47+
and peak memory is O(max_files * CANDIDATE_MULTIPLIER) paths
48+
(ADR-0045 §R2). On a 10M-file monorepo with ``max_files=5000``
49+
we hold at most 50K Path objects during the sort.
50+
- When ``max_files <= 0``: returns every matching path. Peak
51+
memory is O(filtered_files) — we never materialise the whole
52+
tree, only the post-filter survivors.
53+
- Each returned path is a regular file whose extension maps to a
54+
known language (and satisfies ``languages`` if supplied), and
55+
whose size is ``<= max_bytes``.
5356
"""
54-
files: list[Path] = []
5557
lang_filter = set(languages) if languages else None
58+
unbounded = max_files <= 0
59+
60+
if unbounded:
61+
return _collect_unbounded(root, lang_filter, max_bytes)
62+
return _collect_bounded(root, lang_filter, max_files, max_bytes)
63+
64+
65+
def _file_matches(
    path: Path,
    lang_filter: set[str] | None,
    max_bytes: int,
) -> bool:
    """Return True iff ``path`` is a source file we should keep.

    A path is kept when all of the following hold:

    - none of its components is listed in ``IGNORE_DIRS``;
    - its lower-cased suffix maps to a language in ``EXT_TO_LANG``;
    - that language is in ``lang_filter`` (when a non-empty filter is given);
    - it is a regular file whose size is ``<= max_bytes``.

    The pure-path checks run first so that rejected paths cost no
    filesystem calls. Any ``OSError`` raised while probing the filesystem
    is treated as "does not match", so a single unreadable or vanished
    entry cannot abort the whole walk.
    """
    parts = path.parts
    if any(d in parts for d in IGNORE_DIRS):
        return False
    lang = EXT_TO_LANG.get(path.suffix.lower())
    if not lang:
        return False
    if lang_filter and lang not in lang_filter:
        return False
    try:
        # Both probes hit the filesystem; guard them together.
        return path.is_file() and path.stat().st_size <= max_bytes
    except OSError:
        return False
86+
87+
88+
def _collect_unbounded(
    root: Path,
    lang_filter: set[str] | None,
    max_bytes: int,
) -> list[Path]:
    """Walk the tree under ``root`` and return every matching file, sorted.

    Directories named in ``IGNORE_DIRS`` are pruned during the walk, so
    their contents are never enumerated. Every remaining file is still
    passed through ``_file_matches``, which applies the full set of
    filters. Only the files that pass are held in memory.
    """
    import os  # function-scope import: keeps this block self-contained

    survivors: list[Path] = []
    for dirpath, dirnames, filenames in os.walk(root):
        # In-place assignment is what tells os.walk not to descend.
        dirnames[:] = [d for d in dirnames if d not in IGNORE_DIRS]
        base = Path(dirpath)
        survivors.extend(
            candidate
            for candidate in (base / name for name in filenames)
            if _file_matches(candidate, lang_filter, max_bytes)
        )
    survivors.sort()
    return survivors
97+
5698

57-
# Bounded candidate set: take ``max_files * CANDIDATE_MULTIPLIER`` paths
58-
# from the generator, then sort for deterministic ordering. The previous
59-
# ``sorted(root.rglob("*"))`` materialised the entire tree before the
60-
# ``max_files`` cap applied — OOM on large monorepos (ADR-0045 §R2).
99+
def _collect_bounded(
    root: Path,
    lang_filter: set[str] | None,
    max_files: int,
    max_bytes: int,
) -> list[Path]:
    """Return up to ``max_files`` matching paths from a bounded candidate set.

    At most ``max_files * CANDIDATE_MULTIPLIER`` entries are taken from
    ``root.rglob("*")`` and sorted for deterministic ordering; the first
    ``max_files`` of those that pass ``_file_matches`` are returned.
    The candidate budget is consumed before filtering, so entries that
    fail the filter still count against it. See ADR-0045 §R2.
    """
    budget = max(max_files * CANDIDATE_MULTIPLIER, max_files)
    ordered = sorted(itertools.islice(root.rglob("*"), budget))
    # Lazy filter + islice stops probing as soon as the cap is reached.
    matching = (p for p in ordered if _file_matches(p, lang_filter, max_bytes))
    return list(itertools.islice(matching, max_files))
84118

85119

mcp_server/tool_registry_manage.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ def _register_codebase_analyze(mcp: FastMCP) -> None:
179179
async def tool_codebase_analyze(
180180
directory: str | None = None,
181181
languages: list[str] | None = None,
182-
max_files: int = 500,
182+
max_files: int = 0,
183183
max_file_size_kb: int = 100,
184184
incremental: bool = True,
185185
dry_run: bool = False,

tests_py/handlers/test_codebase_analyze_rglob.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,3 +141,40 @@ def test_deterministic_ordering(tmp_path):
141141
tmp_path, languages=None, max_files=30, max_bytes=1_000_000
142142
)
143143
assert a == b
144+
145+
146+
def test_unbounded_returns_every_match(tmp_path):
    """``max_files=0`` disables the cap: every matching file is returned.

    Regression: the handler used to default to ``max_files=500`` and the
    skill passed ``max_files=5000``; both truncated real codebases. With
    ``max_files <= 0`` the walk must be exhaustive.
    """
    expected_count = 7500
    _make_tree(tmp_path, expected_count)
    collected = helpers.collect_source_files(
        tmp_path, languages=None, max_files=0, max_bytes=1_000_000
    )
    assert len(collected) == expected_count
158+
159+
160+
def test_negative_max_files_is_unbounded(tmp_path):
    """A negative ``max_files`` behaves like ``0``: no cap is applied."""
    expected_count = 200
    _make_tree(tmp_path, expected_count)
    collected = helpers.collect_source_files(
        tmp_path, languages=None, max_files=-1, max_bytes=1_000_000
    )
    assert len(collected) == expected_count
167+
168+
169+
def test_unbounded_still_filters(tmp_path):
    """Unbounded walk still applies language, size, and IGNORE_DIRS filters.

    Each rejected fixture targets one filter:

    - ``b.js``: wrong language for ``languages=["python"]``;
    - ``junk.md``: not accepted by the language filter;
    - ``node_modules/ignored.py``: inside a directory the test expects to
      be ignored (presumably listed in ``IGNORE_DIRS`` — confirm);
    - ``huge.py``: right language, but larger than ``max_bytes``.
    """
    (tmp_path / "a.py").write_text("x=1")
    (tmp_path / "b.js").write_text("var x = 1;")
    (tmp_path / "junk.md").write_text("# readme")
    (tmp_path / "node_modules").mkdir()
    (tmp_path / "node_modules" / "ignored.py").write_text("x=1")
    # 2000 bytes > max_bytes below, so the size filter must drop it.
    (tmp_path / "huge.py").write_text("x" * 2_000)

    files = helpers.collect_source_files(
        tmp_path, languages=["python"], max_files=0, max_bytes=1_000
    )
    assert [p.name for p in files] == ["a.py"]

0 commit comments

Comments
 (0)