Skip to content

Commit e18ff83

Browse files
zerone0x and Clawdbot authored
fix: skip files ignored by gitignore (#78)
Fixes #62

- add pathspec dependency and load project .gitignore
- apply gitignore-aware matcher that handles nested rules
- add regression test ensuring ignored files stay out of the index

Co-authored-by: Clawdbot <bot@clawd.bot>
1 parent 3a59352 commit e18ff83

7 files changed

Lines changed: 172 additions & 8 deletions

File tree

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ dependencies = [
3131
"einops>=0.8.2",
3232
"typer>=0.9.0",
3333
"msgspec>=0.19.0",
34+
"pathspec>=0.12.1",
3435
"pyyaml>=6.0",
3536
]
3637

src/cocoindex_code/indexer.py

Lines changed: 116 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,20 @@
22

33
from __future__ import annotations
44

5+
from pathlib import Path, PurePath
6+
from collections.abc import Iterable
7+
58
import cocoindex as coco
69
from cocoindex.connectors import localfs, sqlite
710
from cocoindex.connectors.sqlite import Vec0TableDef
811
from cocoindex.ops.text import RecursiveSplitter, detect_code_language
912
from cocoindex.resources.chunk import Chunk
10-
from cocoindex.resources.file import PatternFilePathMatcher
13+
from cocoindex.resources.file import FilePathMatcher, PatternFilePathMatcher
1114
from cocoindex.resources.id import IdGenerator
15+
from pathspec import GitIgnoreSpec
1216

1317
from .settings import PROJECT_SETTINGS
14-
from .shared import CODEBASE_DIR, EMBEDDER, EXT_LANG_OVERRIDE_MAP, SQLITE_DB, CodeChunk
18+
from .shared import CODEBASE_DIR, EMBEDDER, EXT_LANG_OVERRIDE_MAP, GITIGNORE_SPEC, SQLITE_DB, CodeChunk
1519

1620
# Chunking configuration
1721
CHUNK_SIZE = 800
@@ -22,6 +26,107 @@
2226
splitter = RecursiveSplitter()
2327

2428

29+
def _normalize_gitignore_lines(lines: Iterable[str], directory: PurePath) -> list[str]:
    """Rewrite the lines of one ``.gitignore`` as root-relative gitignore patterns.

    Args:
        lines: Raw lines of a ``.gitignore`` file (trailing newlines allowed).
        directory: Directory containing that ``.gitignore``, relative to the
            project root. ``PurePath(".")`` (or ``PurePath("")``) means the
            root itself, in which case no prefix is added.

    Returns:
        Patterns that have the same meaning when evaluated from the project
        root. Blank lines, comments and patterns that are empty once their
        slashes are removed are dropped.

    Rewrite rules (following the gitignore pattern format):
        * ``/name`` is anchored to ``directory``           -> ``/<prefix>name``
        * a pattern with a slash at the start or middle is
          relative to ``directory``                        -> ``<prefix>pattern``
        * a pattern with no slash, or only a trailing one,
          matches at any depth below ``directory``         -> ``<prefix>**/pattern``
        * a leading ``!`` (negation) is preserved in front of the rewritten
          pattern; ``\\!`` and ``\\#`` denote literal characters and are never
          treated as negation/comment.
    """
    if directory in (PurePath("."), PurePath("")):
        prefix = ""
    else:
        prefix = f"{directory.as_posix().rstrip('/')}/"

    normalized: list[str] = []
    for raw_line in lines:
        line = raw_line.rstrip("\n\r")
        stripped = line.lstrip()
        if not stripped or stripped.startswith("#"):
            continue

        negated = False
        if line.startswith(("\\#", "\\!")):
            # Escaped literal "#"/"!": drop the backslash, but do NOT let the
            # now-leading "!" be mistaken for a negation marker below.
            line = line[1:]
        elif line.startswith("!"):
            negated = True
            line = line[1:]

        body = line.strip()
        if not body:
            continue

        if body.startswith("/"):
            anchored = body.lstrip("/")
            if not anchored:
                # A bare "/" names nothing; skip it rather than emitting a
                # pattern that would cover the whole directory.
                continue
            # Keep the leading slash so the pattern stays anchored even when
            # there is no directory prefix to provide a middle slash.
            pattern = f"/{prefix}{anchored}"
        elif "/" in body.rstrip("/"):
            # Slash in the middle: relative to the .gitignore's directory.
            pattern = f"{prefix}{body}"
        else:
            # No slash (or only a trailing one, e.g. "build/"): may match at
            # any level below the .gitignore's directory.
            pattern = f"{prefix}**/{body}"

        if pattern.startswith(("#", "!")):
            # Re-escape a literal leading "#"/"!" so the emitted pattern is not
            # re-read as a comment or a negation.
            pattern = f"\\{pattern}"
        if negated:
            pattern = f"!{pattern}"
        normalized.append(pattern)
    return normalized
70+
71+
72+
class GitignoreAwareMatcher(FilePathMatcher):
    """Wrap another matcher and additionally exclude paths ignored by ``.gitignore``.

    A path is included only if no applicable ``.gitignore`` rule ignores it AND
    the wrapped ``delegate`` matcher includes it. Rules are accumulated from the
    root spec down through each directory's own ``.gitignore`` and cached per
    directory.
    """

    def __init__(
        self,
        delegate: FilePathMatcher,
        root_spec: GitIgnoreSpec | None,
        project_root: Path,
    ) -> None:
        """Store the wrapped matcher and seed the cache with the root spec.

        Args:
            delegate: Matcher consulted for every path that is not gitignored.
            root_spec: Spec for the project-root ``.gitignore`` or ``None``.
            project_root: Directory that relative paths are resolved against
                when looking for nested ``.gitignore`` files.
        """
        self._delegate = delegate
        self._root = project_root
        # Keyed by directory relative to the project root; "." is the root.
        self._spec_cache: dict[PurePath, GitIgnoreSpec | None] = {PurePath("."): root_spec}

    def _spec_for(self, directory: PurePath) -> GitIgnoreSpec | None:
        """Return the combined spec that applies inside ``directory`` (cached)."""
        if directory in self._spec_cache:
            return self._spec_cache[directory]

        parent_dir = directory.parent
        if parent_dir == directory:
            # A path that is its own parent (other than ".", which is always
            # cached) would recurse forever; fall back to the root spec.
            spec = self._spec_cache[PurePath(".")]
        else:
            spec = self._spec_for(parent_dir)

        gitignore_path = (self._root / directory) / ".gitignore"
        if gitignore_path.is_file():
            try:
                # Decode explicitly so parsing does not depend on the platform
                # default encoding; "utf-8-sig" also tolerates a leading BOM.
                lines = gitignore_path.read_text(encoding="utf-8-sig").splitlines()
            except (OSError, UnicodeDecodeError):
                # Best effort: an unreadable .gitignore contributes no rules.
                lines = []
            normalized = _normalize_gitignore_lines(lines, directory)
            if normalized:
                new_spec = GitIgnoreSpec.from_lines(normalized)
                # Later (deeper) rules are appended after the inherited ones.
                spec = new_spec if spec is None else spec + new_spec

        self._spec_cache[directory] = spec
        return spec

    def _is_ignored(self, path: PurePath, is_dir: bool) -> bool:
        """Return whether ``path`` is matched by the applicable gitignore spec."""
        directory = path if is_dir else path.parent
        if directory == PurePath(""):
            directory = PurePath(".")
        spec = self._spec_for(directory)
        if spec is None:
            return False
        match_path = path.as_posix()
        if is_dir and not match_path.endswith("/"):
            # A trailing slash marks the path as a directory so that
            # directory-only patterns (e.g. "build/") can match it.
            match_path = f"{match_path}/"
        return spec.match_file(match_path)

    def is_dir_included(self, path: PurePath) -> bool:
        """Include a directory only if it is not gitignored and the delegate agrees."""
        if self._is_ignored(path, True):
            return False
        return self._delegate.is_dir_included(path)

    def is_file_included(self, path: PurePath) -> bool:
        """Include a file only if it is not gitignored and the delegate agrees."""
        if self._is_ignored(path, False):
            return False
        return self._delegate.is_file_included(path)
128+
129+
25130
@coco.fn(memo=True)
26131
async def process_file(
27132
file: localfs.File,
@@ -76,6 +181,8 @@ async def process(chunk: Chunk) -> None:
76181
async def indexer_main() -> None:
77182
"""Main indexing function - walks files and processes each."""
78183
ps = coco.use_context(PROJECT_SETTINGS)
184+
gitignore_spec = coco.use_context(GITIGNORE_SPEC)
185+
project_root = coco.use_context(CODEBASE_DIR)
79186

80187
table = await sqlite.mount_table_target(
81188
db=SQLITE_DB,
@@ -90,13 +197,16 @@ async def indexer_main() -> None:
90197
),
91198
)
92199

200+
base_matcher = PatternFilePathMatcher(
201+
included_patterns=ps.include_patterns,
202+
excluded_patterns=ps.exclude_patterns,
203+
)
204+
matcher: FilePathMatcher = GitignoreAwareMatcher(base_matcher, gitignore_spec, project_root)
205+
93206
files = localfs.walk_dir(
94207
CODEBASE_DIR,
95208
recursive=True,
96-
path_matcher=PatternFilePathMatcher(
97-
included_patterns=ps.include_patterns,
98-
excluded_patterns=ps.exclude_patterns,
99-
),
209+
path_matcher=matcher,
100210
)
101211

102212
with coco.component_subpath(coco.Symbol("process_file")):

src/cocoindex_code/project.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,8 @@
1111

1212
from .indexer import indexer_main
1313
from .protocol import IndexingProgress
14-
from .settings import PROJECT_SETTINGS, ProjectSettings
15-
from .shared import CODEBASE_DIR, EMBEDDER, EXT_LANG_OVERRIDE_MAP, SQLITE_DB, Embedder
14+
from .settings import PROJECT_SETTINGS, ProjectSettings, load_gitignore_spec
15+
from .shared import CODEBASE_DIR, EMBEDDER, EXT_LANG_OVERRIDE_MAP, GITIGNORE_SPEC, SQLITE_DB, Embedder
1616

1717

1818
class Project:
@@ -88,6 +88,7 @@ async def create(
8888
target_sqlite_db_path = index_dir / "target_sqlite.db"
8989

9090
settings = coco.Settings.from_env(cocoindex_db_path)
91+
gitignore_spec = load_gitignore_spec(project_root)
9192

9293
context = coco.ContextProvider()
9394
context.provide(CODEBASE_DIR, project_root)
@@ -98,6 +99,7 @@ async def create(
9899
EXT_LANG_OVERRIDE_MAP,
99100
{f".{lo.ext}": lo.lang for lo in project_settings.language_overrides},
100101
)
102+
context.provide(GITIGNORE_SPEC, gitignore_spec)
101103

102104
env = coco.Environment(settings, context_provider=context)
103105
app = coco.App(

src/cocoindex_code/settings.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import cocoindex as _coco
1010
import yaml as _yaml
11+
from pathspec import GitIgnoreSpec
1112

1213
# ---------------------------------------------------------------------------
1314
# Default file patterns (moved from indexer.py)
@@ -189,6 +190,20 @@ def find_parent_with_marker(start: Path) -> Path | None:
189190
current = parent
190191

191192

193+
def load_gitignore_spec(project_root: Path) -> GitIgnoreSpec | None:
    """Load a GitIgnoreSpec for the project's ``.gitignore`` if present.

    Args:
        project_root: Directory expected to contain the ``.gitignore`` file.

    Returns:
        The parsed spec, or ``None`` when the file is missing, cannot be read
        or decoded, or contains no lines.
    """
    gitignore = project_root / ".gitignore"
    if not gitignore.is_file():
        return None
    try:
        # Decode explicitly so the result does not depend on the platform
        # default encoding; "utf-8-sig" also tolerates a leading BOM.
        lines = gitignore.read_text(encoding="utf-8-sig").splitlines()
    except (OSError, UnicodeDecodeError):
        # Best effort: an unreadable .gitignore means "no ignore rules".
        return None
    if not lines:
        return None
    return GitIgnoreSpec.from_lines(lines)
205+
206+
192207
# ---------------------------------------------------------------------------
193208
# Serialization helpers
194209
# ---------------------------------------------------------------------------

src/cocoindex_code/shared.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
import cocoindex as coco
1111
from cocoindex.connectors import sqlite
12+
from pathspec import GitIgnoreSpec
1213
from numpy.typing import NDArray
1314

1415
if TYPE_CHECKING:
@@ -31,6 +32,7 @@
3132
EMBEDDER = coco.ContextKey[Embedder]("embedder")
3233
SQLITE_DB = coco.ContextKey[sqlite.ManagedConnection]("index_db", tracked=False)
3334
CODEBASE_DIR = coco.ContextKey[pathlib.Path]("codebase", tracked=False)
35+
GITIGNORE_SPEC = coco.ContextKey[GitIgnoreSpec | None]("gitignore_spec", tracked=False)
3436
EXT_LANG_OVERRIDE_MAP = coco.ContextKey[dict[str, str]]("ext_lang_override_map")
3537

3638
# Module-level variable — set by daemon at startup (needed for CodeChunk annotation).

tests/test_e2e.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
import pytest
1717
from typer.testing import CliRunner
1818

19+
from cocoindex.connectors import sqlite as coco_sqlite
1920
from cocoindex_code.cli import app
2021
from cocoindex_code.client import stop_daemon
2122
from cocoindex_code.settings import find_parent_with_marker
@@ -298,6 +299,37 @@ def test_session_reset_then_full_reinit(e2e_project: Path) -> None:
298299
assert "main.py" in result.output
299300

300301

302+
def test_session_respects_gitignore(e2e_project: Path) -> None:
    """Indexing should skip files ignored by .gitignore while honoring negations."""
    gitignore_path = e2e_project / ".gitignore"
    # "important.*" would ignore important.py on its own; the later
    # "!important.py" must re-include it, so the negation is actually exercised.
    gitignore_path.write_text("ignored.py\nignored_dir/\nimportant.*\n!important.py\n")

    (e2e_project / "ignored.py").write_text("IGNORED_TOKEN = True\n")
    ignored_dir = e2e_project / "ignored_dir"
    ignored_dir.mkdir()
    (ignored_dir / "nested.py").write_text("NESTED_IGNORED = True\n")
    (e2e_project / "important.py").write_text("IMPORTANT_TOKEN = True\n")

    runner.invoke(app, ["init"], catch_exceptions=False)
    result = runner.invoke(app, ["index"], catch_exceptions=False)
    assert result.exit_code == 0, result.output

    # Read the indexed file paths straight from the target database.
    db_path = e2e_project / ".cocoindex_code" / "target_sqlite.db"
    conn = coco_sqlite.connect(str(db_path), load_vec=True)
    try:
        with conn.readonly() as db:
            file_paths = {
                row[0]
                for row in db.execute("SELECT DISTINCT file_path FROM code_chunks_vec")
            }
    finally:
        conn.close()

    assert "ignored.py" not in file_paths
    assert "ignored_dir/nested.py" not in file_paths
    assert "important.py" in file_paths
331+
332+
301333
@pytest.mark.usefixtures("e2e_project")
302334
def test_session_daemon_stop_and_auto_start() -> None:
303335
"""Init → index → daemon stop → index auto-starts daemon → search works."""

uv.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)