fix(codebase_analyze): default to no file cap, treat max_files<=0 as unbounded (#25)

cdeust · claude · web-flow · commit 2f42428c1c21 · 2026-05-12T09:41:56.000+02:00
The previous default of max_files=500 — and the cortex-setup-project skill's
explicit max_files=5000 — silently truncated real codebases at exactly the
cap. Two of the user's repos (~ai-architect-prd-builder, ai-prd) hit 5000
exactly during a full-scale bootstrap, meaning files were dropped from the
knowledge graph.

Change the contract so max_files<=0 means "no limit". Split the helper
into _collect_bounded (preserves ADR-0045 §R2 streaming for callers who
opt into a cap) and _collect_unbounded (walks the whole tree but only
materialises post-filter survivors — memory is O(filtered_files), not
O(tree_size)). Default now 0 across handler, schema, and tool registry.

Add 3 regression tests: a 7500-file unbounded walk, negative-as-unbounded,
and language/IGNORE_DIRS filtering still applied in unbounded mode. The
existing 7 bounded-mode tests are unchanged.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/mcp_server/handlers/codebase_analyze.py b/mcp_server/handlers/codebase_analyze.py
@@ -93,11 +93,10 @@
             },
             "max_files": {
                 "type": "integer",
-                "description": "Maximum number of files to process per call. Cap to avoid runaway analysis on monorepos.",
-                "default": 500,
-                "minimum": 1,
-                "maximum": 50000,
-                "examples": [100, 500, 5000],
+                "description": "Maximum number of files to process per call. Set to 0 (default) for no limit — process every matching file. Use a positive cap only to bound runaway analysis on extremely large monorepos.",
+                "default": 0,
+                "minimum": 0,
+                "examples": [0, 500, 5000],
             },
             "max_file_size_kb": {
                 "type": "integer",
@@ -129,7 +128,8 @@
 CODEBASE_SOURCE = "codebase_analyze"
 CODEBASE_TAG = "codebase"
 LANG_TAG_PREFIX = "lang:"
-DEFAULT_MAX_FILES = 500
+# 0 = no limit. Positive values cap the walk; see helpers.collect_source_files.
+DEFAULT_MAX_FILES = 0
 DEFAULT_MAX_FILE_SIZE_KB = 100
 
 _store: MemoryStore | None = None
diff --git a/mcp_server/handlers/codebase_analyze_helpers.py b/mcp_server/handlers/codebase_analyze_helpers.py
@@ -38,48 +38,82 @@ def collect_source_files(
 
     Preconditions:
         - ``root`` is an existing directory.
-        - ``max_files > 0`` and ``max_bytes > 0``.
+        - ``max_bytes > 0``.
+        - ``max_files`` may be any integer; ``<= 0`` means "no limit" and
+          processes every matching file in the tree.
 
     Postconditions:
-        - Returns at most ``max_files`` paths, each referring to a regular file
-          whose extension maps to a known language (and satisfies ``languages``
-          if supplied), and whose size is ``<= max_bytes``.
-        - Peak memory footprint is O(max_files * CANDIDATE_MULTIPLIER) paths,
-          not O(tree_size) — see ADR-0045 §R2. On a 10M-file monorepo with
-          ``max_files=5000`` we hold at most 50K Path objects during the sort,
-          not 10M.
-
-    Invariant (per iteration): ``len(files) <= max_files``.
+        - When ``max_files > 0``: returns at most ``max_files`` paths,
+          and peak memory is O(max_files * CANDIDATE_MULTIPLIER) paths
+          (ADR-0045 §R2). On a 10M-file monorepo with ``max_files=5000``
+          we hold at most 50K Path objects during the sort.
+        - When ``max_files <= 0``: returns every matching path. Peak
+          memory is O(filtered_files) — we never materialise the whole
+          tree, only the post-filter survivors.
+        - Each returned path is a regular file whose extension maps to a
+          known language (and satisfies ``languages`` if supplied), and
+          whose size is ``<= max_bytes``.
     """
-    files: list[Path] = []
     lang_filter = set(languages) if languages else None
+    unbounded = max_files <= 0
+
+    if unbounded:
+        return _collect_unbounded(root, lang_filter, max_bytes)
+    return _collect_bounded(root, lang_filter, max_files, max_bytes)
+
+
+def _file_matches(
+    path: Path,
+    lang_filter: set[str] | None,
+    max_bytes: int,
+) -> bool:
+    """Return True iff ``path`` is a source file we should keep."""
+    if not path.is_file():
+        return False
+    if any(d in path.parts for d in IGNORE_DIRS):
+        return False
+    lang = EXT_TO_LANG.get(path.suffix.lower())
+    if not lang:
+        return False
+    if lang_filter and lang not in lang_filter:
+        return False
+    try:
+        if path.stat().st_size > max_bytes:
+            return False
+    except OSError:
+        return False
+    return True
+
+
+def _collect_unbounded(
+    root: Path,
+    lang_filter: set[str] | None,
+    max_bytes: int,
+) -> list[Path]:
+    """Walk the entire tree, filter, then sort. Memory O(filtered_count)."""
+    survivors = [p for p in root.rglob("*") if _file_matches(p, lang_filter, max_bytes)]
+    survivors.sort()
+    return survivors
+
 
-    # Bounded candidate set: take ``max_files * CANDIDATE_MULTIPLIER`` paths
-    # from the generator, then sort for deterministic ordering. The previous
-    # ``sorted(root.rglob("*"))`` materialised the entire tree before the
-    # ``max_files`` cap applied — OOM on large monorepos (ADR-0045 §R2).
+def _collect_bounded(
+    root: Path,
+    lang_filter: set[str] | None,
+    max_files: int,
+    max_bytes: int,
+) -> list[Path]:
+    """Bounded-candidate walk: take ``max_files * CANDIDATE_MULTIPLIER`` paths
+    then sort for deterministic ordering. See ADR-0045 §R2.
+    """
     candidate_cap = max(max_files * CANDIDATE_MULTIPLIER, max_files)
     candidates = sorted(itertools.islice(root.rglob("*"), candidate_cap))
 
+    files: list[Path] = []
     for path in candidates:
         if len(files) >= max_files:
             break
-        if not path.is_file():
-            continue
-        if any(d in path.parts for d in IGNORE_DIRS):
-            continue
-        lang = EXT_TO_LANG.get(path.suffix.lower())
-        if not lang:
-            continue
-        if lang_filter and lang not in lang_filter:
-            continue
-        try:
-            if path.stat().st_size > max_bytes:
-                continue
-        except OSError:
-            continue
-        files.append(path)
-
+        if _file_matches(path, lang_filter, max_bytes):
+            files.append(path)
     return files
 
 
diff --git a/mcp_server/tool_registry_manage.py b/mcp_server/tool_registry_manage.py
@@ -179,7 +179,7 @@ def _register_codebase_analyze(mcp: FastMCP) -> None:
     async def tool_codebase_analyze(
         directory: str | None = None,
         languages: list[str] | None = None,
-        max_files: int = 500,
+        max_files: int = 0,
         max_file_size_kb: int = 100,
         incremental: bool = True,
         dry_run: bool = False,
diff --git a/tests_py/handlers/test_codebase_analyze_rglob.py b/tests_py/handlers/test_codebase_analyze_rglob.py
@@ -141,3 +141,40 @@ def test_deterministic_ordering(tmp_path):
         tmp_path, languages=None, max_files=30, max_bytes=1_000_000
     )
     assert a == b
+
+
+def test_unbounded_returns_every_match(tmp_path):
+    """``max_files=0`` means no cap — return every matching file in the tree.
+
+    Regression: previously the handler defaulted to max_files=500 and the
+    skill called it with max_files=5000, both of which truncated real
+    codebases. With ``max_files <= 0`` the walk must be exhaustive.
+    """
+    _make_tree(tmp_path, 7500)
+    files = helpers.collect_source_files(
+        tmp_path, languages=None, max_files=0, max_bytes=1_000_000
+    )
+    assert len(files) == 7500
+
+
+def test_negative_max_files_is_unbounded(tmp_path):
+    """Any non-positive ``max_files`` is treated as unbounded."""
+    _make_tree(tmp_path, 200)
+    files = helpers.collect_source_files(
+        tmp_path, languages=None, max_files=-1, max_bytes=1_000_000
+    )
+    assert len(files) == 200
+
+
+def test_unbounded_still_filters(tmp_path):
+    """Unbounded walk still applies language, size, and IGNORE_DIRS filters."""
+    (tmp_path / "a.py").write_text("x=1")
+    (tmp_path / "b.js").write_text("var x = 1;")
+    (tmp_path / "junk.md").write_text("# readme")
+    (tmp_path / "node_modules").mkdir()
+    (tmp_path / "node_modules" / "ignored.py").write_text("x=1")
+
+    files = helpers.collect_source_files(
+        tmp_path, languages=["python"], max_files=0, max_bytes=1_000_000
+    )
+    assert [p.name for p in files] == ["a.py"]