22
33from __future__ import annotations
44
5+ from pathlib import Path , PurePath
6+ from collections .abc import Iterable
7+
58import cocoindex as coco
69from cocoindex .connectors import localfs , sqlite
710from cocoindex .connectors .sqlite import Vec0TableDef
811from cocoindex .ops .text import RecursiveSplitter , detect_code_language
912from cocoindex .resources .chunk import Chunk
10- from cocoindex .resources .file import PatternFilePathMatcher
13+ from cocoindex .resources .file import FilePathMatcher , PatternFilePathMatcher
1114from cocoindex .resources .id import IdGenerator
15+ from pathspec import GitIgnoreSpec
1216
1317from .settings import PROJECT_SETTINGS
14- from .shared import CODEBASE_DIR , EMBEDDER , EXT_LANG_OVERRIDE_MAP , SQLITE_DB , CodeChunk
18+ from .shared import CODEBASE_DIR , EMBEDDER , EXT_LANG_OVERRIDE_MAP , GITIGNORE_SPEC , SQLITE_DB , CodeChunk
1519
1620# Chunking configuration
1721CHUNK_SIZE = 800
2226splitter = RecursiveSplitter ()
2327
2428
29+ def _normalize_gitignore_lines (lines : Iterable [str ], directory : PurePath ) -> list [str ]:
30+ """Normalize .gitignore lines to root-relative gitignore patterns."""
31+ if directory in (PurePath ("." ), PurePath ("" )):
32+ prefix = ""
33+ else :
34+ prefix = f"{ directory .as_posix ().rstrip ('/' )} /"
35+
36+ normalized : list [str ] = []
37+ for raw_line in lines :
38+ line = raw_line .rstrip ("\n \r " )
39+ if not line :
40+ continue
41+ stripped = line .lstrip ()
42+ if not stripped or stripped .startswith ("#" ):
43+ continue
44+ if line .startswith ("\\ #" ) or line .startswith ("\\ !" ):
45+ line = line [1 :]
46+ negated = line .startswith ("!" )
47+ if negated :
48+ line = line [1 :]
49+ body = line .strip ()
50+ if not body :
51+ continue
52+ anchor = body .startswith ("/" )
53+ if anchor :
54+ body = body .lstrip ("/" )
55+ pattern = f"{ prefix } { body } " if prefix else body
56+ else :
57+ contains_slash = "/" in body
58+ base = prefix
59+ if contains_slash :
60+ pattern = f"{ base } { body } "
61+ else :
62+ if base :
63+ pattern = f"{ base } **/{ body } "
64+ else :
65+ pattern = f"**/{ body } "
66+ if negated :
67+ pattern = f"!{ pattern } "
68+ normalized .append (pattern )
69+ return normalized
70+
71+
class GitignoreAwareMatcher(FilePathMatcher):
    """Wraps another matcher and applies .gitignore filtering.

    A path is included only if no applicable .gitignore pattern matches it
    and the wrapped matcher also includes it. The spec for a directory is the
    spec of its parent extended with the patterns of the directory's own
    .gitignore file, computed lazily and cached per directory.
    """

    def __init__(
        self,
        delegate: FilePathMatcher,
        root_spec: GitIgnoreSpec | None,
        project_root: Path,
    ) -> None:
        """Create the matcher.

        Args:
            delegate: Matcher consulted for paths that are not ignored.
            root_spec: Spec used for the project root; ``None`` for no rules.
                It seeds the cache, so no .gitignore is read for the root
                directory by this class.
            project_root: Directory against which relative paths are resolved
                when looking for nested .gitignore files.
        """
        self._delegate = delegate
        self._root = project_root
        self._spec_cache: dict[PurePath, GitIgnoreSpec | None] = {PurePath("."): root_spec}

    def _spec_for(self, directory: PurePath) -> GitIgnoreSpec | None:
        """Return the accumulated spec for *directory* (``None`` if no rules)."""
        if directory in self._spec_cache:
            return self._spec_cache[directory]

        # "." is always cached, so for the relative paths handled here the
        # recursion ends at the project root.
        spec = self._spec_for(directory.parent)

        gitignore_path = (self._root / directory) / ".gitignore"
        if gitignore_path.is_file():
            try:
                # Decode explicitly as UTF-8: with the platform default
                # encoding a valid file could fail to decode and its rules
                # would be silently dropped by the handler below.
                lines = gitignore_path.read_text(encoding="utf-8").splitlines()
            except (OSError, UnicodeDecodeError):
                # Best effort: an unreadable .gitignore contributes no rules.
                lines = []
            normalized = _normalize_gitignore_lines(lines, directory)
            if normalized:
                new_spec = GitIgnoreSpec.from_lines(normalized)
                spec = new_spec if spec is None else spec + new_spec

        self._spec_cache[directory] = spec
        return spec

    def _is_ignored(self, path: PurePath, is_dir: bool) -> bool:
        """Return whether *path* is matched by the applicable ignore spec."""
        directory = path if is_dir else path.parent
        if directory == PurePath(""):
            directory = PurePath(".")
        spec = self._spec_for(directory)
        if spec is None:
            return False
        match_path = path.as_posix()
        # A trailing slash lets directory-only patterns match this path.
        if is_dir and not match_path.endswith("/"):
            match_path = f"{match_path}/"
        return spec.match_file(match_path)

    def is_dir_included(self, path: PurePath) -> bool:
        """Return ``False`` for ignored directories, else ask the delegate."""
        if self._is_ignored(path, True):
            return False
        return self._delegate.is_dir_included(path)

    def is_file_included(self, path: PurePath) -> bool:
        """Return ``False`` for ignored files, else ask the delegate."""
        if self._is_ignored(path, False):
            return False
        return self._delegate.is_file_included(path)
129+
25130@coco .fn (memo = True )
26131async def process_file (
27132 file : localfs .File ,
@@ -76,6 +181,8 @@ async def process(chunk: Chunk) -> None:
76181async def indexer_main () -> None :
77182 """Main indexing function - walks files and processes each."""
78183 ps = coco .use_context (PROJECT_SETTINGS )
184+ gitignore_spec = coco .use_context (GITIGNORE_SPEC )
185+ project_root = coco .use_context (CODEBASE_DIR )
79186
80187 table = await sqlite .mount_table_target (
81188 db = SQLITE_DB ,
@@ -90,13 +197,16 @@ async def indexer_main() -> None:
90197 ),
91198 )
92199
200+ base_matcher = PatternFilePathMatcher (
201+ included_patterns = ps .include_patterns ,
202+ excluded_patterns = ps .exclude_patterns ,
203+ )
204+ matcher : FilePathMatcher = GitignoreAwareMatcher (base_matcher , gitignore_spec , project_root )
205+
93206 files = localfs .walk_dir (
94207 CODEBASE_DIR ,
95208 recursive = True ,
96- path_matcher = PatternFilePathMatcher (
97- included_patterns = ps .include_patterns ,
98- excluded_patterns = ps .exclude_patterns ,
99- ),
209+ path_matcher = matcher ,
100210 )
101211
102212 with coco .component_subpath (coco .Symbol ("process_file" )):
0 commit comments