-
Notifications
You must be signed in to change notification settings - Fork 111
Expand file tree
/
Copy pathindexer.py
More file actions
103 lines (84 loc) · 2.93 KB
/
indexer.py
File metadata and controls
103 lines (84 loc) · 2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""CocoIndex app for indexing codebases."""
from __future__ import annotations
import cocoindex as coco
from cocoindex.connectors import localfs, sqlite
from cocoindex.connectors.sqlite import Vec0TableDef
from cocoindex.ops.text import RecursiveSplitter, detect_code_language
from cocoindex.resources.chunk import Chunk
from cocoindex.resources.file import PatternFilePathMatcher
from cocoindex.resources.id import IdGenerator
from .settings import PROJECT_SETTINGS
from .shared import CODEBASE_DIR, EMBEDDER, EXT_LANG_OVERRIDE_MAP, SQLITE_DB, CodeChunk
# Chunking configuration, forwarded to splitter.split() in process_file.
# NOTE(review): the unit of these sizes (characters, bytes, or tokens) is
# defined by RecursiveSplitter and is not visible here — confirm in its docs.
CHUNK_SIZE = 800  # passed as chunk_size
MIN_CHUNK_SIZE = 200  # passed as min_chunk_size
CHUNK_OVERLAP = 150  # passed as chunk_overlap
# Chunking splitter (stateless, can be module-level); shared by every
# process_file call instead of being rebuilt per file.
splitter = RecursiveSplitter()
@coco.fn(memo=True)
async def process_file(
    file: localfs.File,
    table: sqlite.TableTarget[CodeChunk],
) -> None:
    """Chunk one file, embed every chunk, and declare the resulting rows.

    A file is skipped (nothing is declared) when its content cannot be
    decoded as text or when it contains only whitespace.

    Args:
        file: The file to read and index.
        table: Table target that receives one ``CodeChunk`` row per chunk.
    """
    embedder = coco.use_context(EMBEDDER)

    try:
        source_text = await file.read_text()
    except UnicodeDecodeError:
        # Not decodable as text: leave the file out of the index.
        return
    if not source_text.strip():
        return

    path = file.file_path.path
    extension = path.suffix
    overrides = coco.use_context(EXT_LANG_OVERRIDE_MAP)

    # Language resolution order: explicit per-extension override, then
    # detection from the file name, then the generic "text" fallback.
    language = overrides.get(extension)
    if not language:
        language = detect_code_language(filename=path.name)
    if not language:
        language = "text"

    chunks = splitter.split(
        source_text,
        chunk_size=CHUNK_SIZE,
        min_chunk_size=MIN_CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        language=language,
    )
    id_gen = IdGenerator()

    async def process(chunk: Chunk) -> None:
        """Build the row for a single chunk and declare it on the table."""
        chunk_text = chunk.text
        # The id is requested before the embedding, matching the order in
        # which the row's fields are populated.
        row_id = await id_gen.next_id(chunk_text)
        posix_path = path.as_posix()
        first_line = chunk.start.line
        last_line = chunk.end.line
        vector = await embedder.embed(chunk_text)
        row = CodeChunk(
            id=row_id,
            file_path=posix_path,
            language=language,
            content=chunk_text,
            start_line=first_line,
            end_line=last_line,
            embedding=vector,
        )
        table.declare_row(row=row)

    await coco.map(process, chunks)
@coco.fn
async def indexer_main() -> None:
    """Entry point of the indexer.

    Mounts the ``code_chunks_vec`` table target, walks ``CODEBASE_DIR``
    using the include/exclude patterns from the project settings, and
    mounts ``process_file`` once per matched file.
    """
    settings = coco.use_context(PROJECT_SETTINGS)

    # Table target: schema derived from CodeChunk, keyed by "id".
    schema = await sqlite.TableSchema.from_class(
        CodeChunk,
        primary_key=["id"],
    )
    vec_table_def = Vec0TableDef(
        partition_key_columns=["language"],
        auxiliary_columns=["file_path", "content", "start_line", "end_line"],
    )
    target = await sqlite.mount_table_target(
        db=SQLITE_DB,
        table_name="code_chunks_vec",
        table_schema=schema,
        virtual_table_def=vec_table_def,
    )

    # Source files: recursive walk filtered by the configured patterns.
    matcher = PatternFilePathMatcher(
        included_patterns=settings.include_patterns,
        excluded_patterns=settings.exclude_patterns,
    )
    source_files = localfs.walk_dir(
        CODEBASE_DIR,
        recursive=True,
        path_matcher=matcher,
    )

    subpath = coco.Symbol("process_file")
    with coco.component_subpath(subpath):
        await coco.mount_each(process_file, source_files.items(), target)