From dee8d353911cc732153b4874e06fa62b8d369646 Mon Sep 17 00:00:00 2001 From: Earl Plak Date: Wed, 8 Apr 2026 00:21:14 +0200 Subject: [PATCH 1/3] Respect .gitignore when discovering files In git repositories, filter discovered files through `git check-ignore --stdin` so that paths matched by .gitignore, .git/info/exclude, and the global gitignore are excluded from the index. Falls back gracefully to the existing exclude_patterns logic when the project is not a git repo or git is unavailable. --- src/mcp_codebase_index/project_indexer.py | 49 ++++++++- tests/test_project_indexer.py | 115 ++++++++++++++++++++++ 2 files changed, 163 insertions(+), 1 deletion(-) diff --git a/src/mcp_codebase_index/project_indexer.py b/src/mcp_codebase_index/project_indexer.py index aea32c4..acc573e 100644 --- a/src/mcp_codebase_index/project_indexer.py +++ b/src/mcp_codebase_index/project_indexer.py @@ -26,6 +26,7 @@ import logging import os import re +import subprocess import sys import time from pathlib import Path @@ -336,7 +337,11 @@ def rebuild_graphs(self) -> None: # ------------------------------------------------------------------ def _discover_files(self) -> list[str]: - """Discover files matching include patterns, excluding exclude patterns.""" + """Discover files matching include patterns, excluding exclude patterns. + + In git repositories, paths ignored by .gitignore (and .git/info/exclude, + global gitignore) are also excluded via ``git check-ignore``. + """ root = Path(self.root_path) matched: set[str] = set() @@ -360,6 +365,15 @@ def _discover_files(self) -> list[str]: matched.add(abs_str) + # In git repos, also honour .gitignore / .git/info/exclude / global + # gitignore by asking git which of the discovered paths it would ignore. + git_ignored = self._get_git_ignored_paths(matched) + if git_ignored: + logger.info( + "Excluding %d git-ignored files from index", len(git_ignored), + ) + matched -= git_ignored + return sorted(matched) def _is_excluded(self, rel_path: str) -> bool: @@ -378,6 +392,39 @@ def _is_excluded(self, rel_path: str) -> bool: return True return False + def _get_git_ignored_paths(self, paths: set[str]) -> set[str]: + """Return the subset of *paths* that git considers ignored. + + Uses ``git check-ignore --stdin`` which respects ``.gitignore``, + ``.git/info/exclude``, and the global gitignore. Returns an empty + set when the project is not a git repository or git is unavailable. + """ + if not paths: + return set() + try: + result = subprocess.run( + ["git", "check-ignore", "--stdin"], + input="\n".join(paths), + cwd=self.root_path, + capture_output=True, + text=True, + timeout=30, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + return set() + + # git check-ignore exits 0 when at least one path is ignored, + # 1 when none are ignored, and 128 on error. + if result.returncode not in (0, 1): + return set() + + ignored: set[str] = set() + for line in result.stdout.splitlines(): + line = line.strip() + if line: + ignored.add(line) + return ignored + def _read_file(self, abs_path: str) -> str: """Read a file as text, trying UTF-8 first then latin-1 as fallback.""" try: diff --git a/tests/test_project_indexer.py b/tests/test_project_indexer.py index 3e4f160..da1e6c0 100644 --- a/tests/test_project_indexer.py +++ b/tests/test_project_indexer.py @@ -658,6 +658,121 @@ def test_rust_import_resolution(self, rust_project): assert models_path in imports or utils_path in imports +# --------------------------------------------------------------------------- +# Test: .gitignore / git-ignore filtering +# --------------------------------------------------------------------------- + + +@pytest.fixture +def git_project(tmp_path): + """Create a project inside a git repo with a .gitignore. + + Structure: + repo/ + .gitignore (ignores build/ and *.log) + src/ + app.py + build/ + output.py (should be git-ignored) + debug.log (should be git-ignored) + README.md + """ + import subprocess + + root = tmp_path / "repo" + root.mkdir() + + # Initialise a git repo so git check-ignore works. + subprocess.run( + ["git", "init"], cwd=str(root), capture_output=True, check=True, + ) + subprocess.run( + ["git", "config", "user.email", "test@test.com"], + cwd=str(root), capture_output=True, check=True, + ) + subprocess.run( + ["git", "config", "user.name", "Test"], + cwd=str(root), capture_output=True, check=True, + ) + + # .gitignore + (root / ".gitignore").write_text("build/\n*.log\n") + + # Source file (not ignored) + src = root / "src" + src.mkdir() + (src / "app.py").write_text("def main():\n pass\n") + + # Build artefact (git-ignored) + build = root / "build" + build.mkdir() + (build / "output.py").write_text("x = 1\n") + + # Log file (git-ignored) + (root / "debug.log").write_text("some log\n") + + # README (not ignored) + (root / "README.md").write_text("# Repo\n") + + return root + + +class TestGitIgnore: + """Verify that _get_git_ignored_paths and _discover_files honour + .gitignore when the project is a git repository.""" + + def test_git_ignored_paths_detected(self, git_project): + indexer = ProjectIndexer(str(git_project)) + # Build a set of all candidate paths (before filtering). + all_paths: set[str] = set() + for pattern in indexer.include_patterns: + from pathlib import Path + + for p in Path(str(git_project)).glob(pattern): + if p.is_file(): + all_paths.add(str(p)) + + ignored = indexer._get_git_ignored_paths(all_paths) + ignored_names = {os.path.basename(p) for p in ignored} + + assert "output.py" in ignored_names, "build/output.py should be git-ignored" + + def test_discover_files_excludes_git_ignored(self, git_project): + indexer = ProjectIndexer(str(git_project)) + idx = indexer.index() + + filenames = {os.path.basename(f) for f in idx.files} + assert "app.py" in filenames, "src/app.py should be indexed" + assert "README.md" in filenames, "README.md should be indexed" + assert "output.py" not in filenames, "build/output.py should be excluded" + + def test_non_git_project_no_filtering(self, sample_project): + """In a non-git directory, _get_git_ignored_paths returns empty.""" + indexer = ProjectIndexer(str(sample_project)) + ignored = indexer._get_git_ignored_paths({str(sample_project / "README.md")}) + assert ignored == set() + + def test_gitignore_with_nested_patterns(self, git_project): + """Additional patterns added to .gitignore are also respected.""" + import subprocess + + # Add a new ignore pattern + gitignore = git_project / ".gitignore" + gitignore.write_text(gitignore.read_text() + "src/generated/\n") + + # Create the directory with a file + gen = git_project / "src" / "generated" + gen.mkdir() + (gen / "auto.py").write_text("y = 2\n") + + indexer = ProjectIndexer(str(git_project)) + idx = indexer.index() + + filenames = {os.path.basename(f) for f in idx.files} + assert "auto.py" not in filenames, "generated/auto.py should be excluded" + assert "app.py" in filenames, "src/app.py should still be indexed" + + class TestIntegration: def test_index_mcp_codebase_index_source(self): """Index the actual mcp-codebase-index src directory as an integration test.""" From 1b77b6be709db4149eb3df6502ac0dc092fb9d1c Mon Sep 17 00:00:00 2001 From: Earl Plak Date: Wed, 8 Apr 2026 00:45:47 +0200 Subject: [PATCH 2/3] Support multi-repo PROJECT_ROOT for .gitignore filtering When PROJECT_ROOT spans multiple git repositories (or is not itself a git repo), group discovered files by their containing repo and run git check-ignore once per repo. This ensures each repo's .gitignore is respected even when the indexed directory is a parent of several independent repos. Add _find_git_root() helper and a multi-repo test case with three independent repos under a shared parent directory. --- src/mcp_codebase_index/project_indexer.py | 88 +++++++++++++++++------ tests/test_project_indexer.py | 46 ++++++++++++ 2 files changed, 113 insertions(+), 21 deletions(-) diff --git a/src/mcp_codebase_index/project_indexer.py b/src/mcp_codebase_index/project_indexer.py index acc573e..1c3b6ea 100644 --- a/src/mcp_codebase_index/project_indexer.py +++ b/src/mcp_codebase_index/project_indexer.py @@ -392,37 +392,83 @@ def _is_excluded(self, rel_path: str) -> bool: return True return False - def _get_git_ignored_paths(self, paths: set[str]) -> set[str]: - """Return the subset of *paths* that git considers ignored. - - Uses ``git check-ignore --stdin`` which respects ``.gitignore``, - ``.git/info/exclude``, and the global gitignore. Returns an empty - set when the project is not a git repository or git is unavailable. - """ - if not paths: - return set() + @staticmethod + def _find_git_root(path: str) -> str | None: + """Return the git work-tree root for *path*, or ``None``.""" try: result = subprocess.run( - ["git", "check-ignore", "--stdin"], - input="\n".join(paths), - cwd=self.root_path, + ["git", "rev-parse", "--show-toplevel"], + cwd=path, capture_output=True, text=True, - timeout=30, + timeout=10, ) + if result.returncode == 0: + return result.stdout.strip() except (FileNotFoundError, subprocess.TimeoutExpired): - return set() + pass + return None + + def _get_git_ignored_paths(self, paths: set[str]) -> set[str]: + """Return the subset of *paths* that git considers ignored. - # git check-ignore exits 0 when at least one path is ignored, - # 1 when none are ignored, and 128 on error. - if result.returncode not in (0, 1): + Groups paths by their containing git repository and runs + ``git check-ignore --stdin`` once per repo, so that per-repo + ``.gitignore`` files are respected even when ``PROJECT_ROOT`` + spans multiple repositories or is not itself a git repo. + + Returns an empty set when no paths are inside a git repository + or git is unavailable. + """ + if not paths: return set() + # Group paths by their git repo root. + repos: dict[str, list[str]] = {} # repo_root -> [abs_paths] + no_repo_dirs: set[str] = set() # dirs already known to lack a repo + for abs_path in paths: + dir_path = os.path.dirname(abs_path) + # Fast-path: skip lookup if we already know this dir has no repo. + if dir_path in no_repo_dirs: + continue + # Check cache of known repo roots. + repo_root: str | None = None + for known_root in repos: + if abs_path.startswith(known_root + os.sep): + repo_root = known_root + break + if repo_root is None: + repo_root = self._find_git_root(dir_path) + if repo_root is None: + no_repo_dirs.add(dir_path) + continue + repos.setdefault(repo_root, []).append(abs_path) + + # Run git check-ignore once per repo. ignored: set[str] = set() - for line in result.stdout.splitlines(): - line = line.strip() - if line: - ignored.add(line) + for repo_root, repo_paths in repos.items(): + try: + result = subprocess.run( + ["git", "check-ignore", "--stdin"], + input="\n".join(repo_paths), + cwd=repo_root, + capture_output=True, + text=True, + timeout=30, + ) + except (FileNotFoundError, subprocess.TimeoutExpired): + continue + + # git check-ignore exits 0 when at least one path is ignored, + # 1 when none are ignored, and 128 on error. + if result.returncode not in (0, 1): + continue + + for line in result.stdout.splitlines(): + line = line.strip() + if line: + ignored.add(line) + return ignored def _read_file(self, abs_path: str) -> str: diff --git a/tests/test_project_indexer.py b/tests/test_project_indexer.py index da1e6c0..46f0598 100644 --- a/tests/test_project_indexer.py +++ b/tests/test_project_indexer.py @@ -752,6 +752,52 @@ def test_non_git_project_no_filtering(self, sample_project): ignored = indexer._get_git_ignored_paths({str(sample_project / "README.md")}) assert ignored == set() + def test_multi_repo_parent_directory(self, tmp_path): + """PROJECT_ROOT spans multiple git repos, each with its own .gitignore.""" + import subprocess + + parent = tmp_path / "code" + parent.mkdir() + + def _init_repo(name, gitignore, src_file, ignored_dir, ignored_file): + repo = parent / name + repo.mkdir() + subprocess.run( + ["git", "init"], cwd=str(repo), capture_output=True, check=True, + ) + subprocess.run( + ["git", "config", "user.email", "test@test.com"], + cwd=str(repo), capture_output=True, check=True, + ) + subprocess.run( + ["git", "config", "user.name", "Test"], + cwd=str(repo), capture_output=True, check=True, + ) + (repo / ".gitignore").write_text(gitignore) + src = repo / "src" + src.mkdir() + (src / src_file).write_text("x = 1\n") + ign = repo / ignored_dir + ign.mkdir() + (ign / ignored_file).write_text("x = 1\n") + + _init_repo("alpha", "dist/\n", "main.py", "dist", "bundle.py") + _init_repo("beta", "build/\n", "app.py", "build", "output.py") + _init_repo("gamma", ".next/\n", "index.py", ".next", "cache.py") + + indexer = ProjectIndexer(str(parent)) + idx = indexer.index() + + filenames = {os.path.basename(f) for f in idx.files} + # Source files should be indexed. + assert "main.py" in filenames + assert "app.py" in filenames + assert "index.py" in filenames + # Git-ignored files should be excluded. + assert "bundle.py" not in filenames, "alpha/dist/bundle.py should be excluded" + assert "output.py" not in filenames, "beta/build/output.py should be excluded" + assert "cache.py" not in filenames, "gamma/.next/cache.py should be excluded" + def test_gitignore_with_nested_patterns(self, git_project): """Additional patterns added to .gitignore are also respected.""" import subprocess From a2c40f5d27128ecbb9efe10c2f83e5ce5d4e1580 Mon Sep 17 00:00:00 2001 From: Earl Plak Date: Wed, 8 Apr 2026 00:48:26 +0200 Subject: [PATCH 3/3] Apply git-ignore filtering during incremental updates The incremental update path in _maybe_incremental_update() only checked the hardcoded exclude_patterns, not .gitignore. Collect candidate paths from the changeset, batch-check them via _get_git_ignored_paths(), and skip any that are git-ignored before reindexing. --- src/mcp_codebase_index/server.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/mcp_codebase_index/server.py b/src/mcp_codebase_index/server.py index 72126da..8b3154b 100644 --- a/src/mcp_codebase_index/server.py +++ b/src/mcp_codebase_index/server.py @@ -319,7 +319,9 @@ def _maybe_incremental_update() -> None: if path in idx.files: _indexer.remove_file(path) - # Process modifications and additions + # Process modifications and additions — collect candidates first, + # then filter out git-ignored paths in one batch call. + candidates: list[tuple[str, str]] = [] # (rel_path, abs_path) for path in changeset.modified + changeset.added: if _indexer._is_excluded(path): continue @@ -328,7 +330,15 @@ def _maybe_incremental_update() -> None: abs_path = os.path.join(_project_root, path) if not os.path.isfile(abs_path): continue - _indexer.reindex_file(path, skip_graph_rebuild=True) + candidates.append((path, abs_path)) + + git_ignored = _indexer._get_git_ignored_paths( + {abs_path for _, abs_path in candidates}, + ) + for rel_path, abs_path in candidates: + if abs_path in git_ignored: + continue + _indexer.reindex_file(rel_path, skip_graph_rebuild=True) # Rebuild cross-file graphs once _indexer.rebuild_graphs()