Skip to content

Commit 9e84e94

Browse files
committed
Add repo key
1 parent 589b07d commit 9e84e94

13 files changed

Lines changed: 158 additions & 24 deletions

File tree

src/cocoindex_code/cli.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ def __call__(
104104
query: str,
105105
languages: list[str] | None = None,
106106
paths: list[str] | None = None,
107+
repo_keys: list[str] | None = None,
107108
limit: int = 5,
108109
offset: int = 0,
109110
on_waiting: Callable[[], None] | None = None,
@@ -194,6 +195,7 @@ def search_response_json_payload(response: SearchResponse) -> dict[str, object]:
194195
"results": [
195196
{
196197
"file_path": r.file_path,
198+
"repo_key": r.repo_key,
197199
"language": r.language,
198200
"content": r.content,
199201
"start_line": r.start_line,
@@ -317,6 +319,7 @@ def handle_bridge_jsonrpc_request(
317319
query=_required_str(params, "query"),
318320
languages=_optional_str_list(params, "languages"),
319321
paths=_optional_str_list(params, "paths"),
322+
repo_keys=_optional_str_list(params, "repo_keys"),
320323
limit=_positive_int_param(params, "limit", 10),
321324
offset=_non_negative_int_param(params, "offset", 0),
322325
)
@@ -405,6 +408,7 @@ def _search_with_wait_spinner(
405408
query: str,
406409
languages: list[str] | None = None,
407410
paths: list[str] | None = None,
411+
repo_keys: list[str] | None = None,
408412
limit: int = 10,
409413
offset: int = 0,
410414
) -> SearchResponse:
@@ -430,6 +434,7 @@ def _on_waiting() -> None:
430434
query=query,
431435
languages=languages,
432436
paths=paths,
437+
repo_keys=repo_keys,
433438
limit=limit,
434439
offset=offset,
435440
on_waiting=_on_waiting,
@@ -722,6 +727,7 @@ def search(
722727
query: list[str] = _typer.Argument(..., help="Search query"),
723728
lang: list[str] = _typer.Option([], "--lang", help="Filter by language"),
724729
path: str | None = _typer.Option(None, "--path", help="Filter by file path glob"),
730+
repo_key: list[str] = _typer.Option([], "--repo-key", help="Filter by indexed repo key"),
725731
offset: int = _typer.Option(0, "--offset", help="Number of results to skip"),
726732
limit: int = _typer.Option(10, "--limit", help="Maximum results to return"),
727733
refresh: bool = _typer.Option(False, "--refresh", help="Refresh index before searching"),
@@ -748,6 +754,7 @@ def search(
748754
query=query_str,
749755
languages=lang or None,
750756
paths=paths,
757+
repo_keys=repo_key or None,
751758
limit=limit,
752759
offset=offset,
753760
)

src/cocoindex_code/client.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ def search(
278278
query: str,
279279
languages: list[str] | None = None,
280280
paths: list[str] | None = None,
281+
repo_keys: list[str] | None = None,
281282
limit: int = 5,
282283
offset: int = 0,
283284
on_waiting: Callable[[], None] | None = None,
@@ -298,6 +299,7 @@ def search(
298299
query=query,
299300
languages=languages,
300301
paths=paths,
302+
repo_keys=repo_keys,
301303
limit=limit,
302304
offset=offset,
303305
)

src/cocoindex_code/daemon.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -275,6 +275,7 @@ async def _search_with_wait(
275275
query=req.query,
276276
languages=req.languages,
277277
paths=req.paths,
278+
repo_keys=req.repo_keys,
278279
limit=req.limit,
279280
offset=req.offset,
280281
)
@@ -488,6 +489,7 @@ async def _dispatch(
488489
query=req.query,
489490
languages=req.languages,
490491
paths=req.paths,
492+
repo_keys=req.repo_keys,
491493
limit=req.limit,
492494
offset=req.offset,
493495
)

src/cocoindex_code/indexer.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,22 @@
3333
splitter = RecursiveSplitter()
3434

3535

36+
def repo_key_for_path(file_path: PurePath, project_root: Path) -> str:
    """Return the key of the Git repository that contains ``file_path``.

    Walks from the file's directory upwards and returns the POSIX form of the
    first directory that holds a ``.git`` entry under ``project_root``
    (``"."`` when that directory is the project root itself). When no ``.git``
    entry is found, falls back to the first path component, or ``"."`` for a
    file that sits directly in the project root.

    Args:
        file_path: Path of the file; expected to be relative to
            ``project_root``.
        project_root: Directory that ``file_path`` is resolved against.

    Returns:
        The repo key as a string.
    """
    directory = file_path.parent
    while True:
        # exists() accepts a `.git` entry of any kind (file or directory).
        if (project_root / directory / ".git").exists():
            return directory.as_posix()

        parent = directory.parent
        # The top of the path is its own parent: "." for relative paths, the
        # root/anchor for absolute ones. Stopping on that condition (instead of
        # only on ".") guarantees termination for absolute inputs as well.
        if parent == directory:
            break
        directory = parent

    parts = file_path.parts
    return parts[0] if len(parts) > 1 else "."
50+
51+
3652
def _normalize_gitignore_lines(lines: Iterable[str], directory: PurePath) -> list[str]:
3753
"""Normalize .gitignore lines to root-relative gitignore patterns."""
3854
if directory in (PurePath("."), PurePath("")):
@@ -151,8 +167,9 @@ async def process_file(
151167
if not content.strip():
152168
return
153169

154-
suffix = file.file_path.path.suffix
155170
project_root = coco.use_context(CODEBASE_DIR)
171+
suffix = file.file_path.path.suffix
172+
repo_key = repo_key_for_path(file.file_path.path, project_root)
156173
ps = load_project_settings(project_root)
157174
ext_lang_map = {f".{lo.ext}": lo.lang for lo in ps.language_overrides}
158175
language = (
@@ -183,6 +200,7 @@ async def process(chunk: Chunk) -> None:
183200
row=CodeChunk(
184201
id=await id_gen.next_id(chunk.text),
185202
file_path=file.file_path.path.as_posix(),
203+
repo_key=repo_key,
186204
language=language,
187205
content=chunk.text,
188206
start_line=chunk.start.line,
@@ -209,7 +227,7 @@ async def indexer_main() -> None:
209227
primary_key=["id"],
210228
),
211229
virtual_table_def=Vec0TableDef(
212-
partition_key_columns=["language"],
230+
partition_key_columns=["repo_key", "language"],
213231
auxiliary_columns=["file_path", "content", "start_line", "end_line"],
214232
),
215233
)

src/cocoindex_code/project.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ async def search(
179179
query: str,
180180
languages: list[str] | None = None,
181181
paths: list[str] | None = None,
182+
repo_keys: list[str] | None = None,
182183
limit: int = 5,
183184
offset: int = 0,
184185
) -> list[SearchResult]:
@@ -192,10 +193,12 @@ async def search(
192193
offset=offset,
193194
languages=languages,
194195
paths=paths,
196+
repo_keys=repo_keys,
195197
)
196198
return [
197199
SearchResult(
198200
file_path=r.file_path,
201+
repo_key=r.repo_key,
199202
language=r.language,
200203
content=r.content,
201204
start_line=r.start_line,

src/cocoindex_code/protocol.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ class SearchRequest(_msgspec.Struct, tag="search"):
2222
query: str
2323
languages: list[str] | None = None
2424
paths: list[str] | None = None
25+
repo_keys: list[str] | None = None
2526
limit: int = 5
2627
offset: int = 0
2728

@@ -111,6 +112,7 @@ class SearchResult(_msgspec.Struct):
111112
start_line: int
112113
end_line: int
113114
score: float
115+
repo_key: str | None = None
114116

115117

116118
class SearchResponse(_msgspec.Struct, tag="search"):

src/cocoindex_code/query.py

Lines changed: 71 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -21,26 +21,29 @@ def _knn_query(
2121
embedding_bytes: bytes,
2222
k: int,
2323
language: str | None = None,
24+
repo_key: str | None = None,
25+
has_repo_key: bool = False,
2426
) -> list[tuple[Any, ...]]:
2527
"""Run a vec0 KNN query, optionally constrained to a repo-key and/or language partition."""
28+
conditions = ["embedding MATCH ?", "k = ?"]
29+
params: list[Any] = [embedding_bytes, k]
30+
if repo_key is not None:
31+
conditions.append("repo_key = ?")
32+
params.append(repo_key)
2633
if language is not None:
27-
return conn.execute(
28-
"""
29-
SELECT file_path, language, content, start_line, end_line, distance
30-
FROM code_chunks_vec
31-
WHERE embedding MATCH ? AND k = ? AND language = ?
32-
ORDER BY distance
33-
""",
34-
(embedding_bytes, k, language),
35-
).fetchall()
34+
conditions.append("language = ?")
35+
params.append(language)
36+
37+
repo_key_select = "repo_key" if has_repo_key else "NULL"
3638
return conn.execute(
37-
"""
38-
SELECT file_path, language, content, start_line, end_line, distance
39+
f"""
40+
SELECT file_path, {repo_key_select} as repo_key,
41+
language, content, start_line, end_line, distance
3942
FROM code_chunks_vec
40-
WHERE embedding MATCH ? AND k = ?
43+
WHERE {" AND ".join(conditions)}
4144
ORDER BY distance
4245
""",
43-
(embedding_bytes, k),
46+
params,
4447
).fetchall()
4548

4649

@@ -51,27 +54,42 @@ def _full_scan_query(
5154
offset: int,
5255
languages: list[str] | None = None,
5356
paths: list[str] | None = None,
57+
repo_keys: list[str] | None = None,
5458
) -> list[tuple[Any, ...]]:
5559
"""Full scan with SQL-level distance computation and filtering."""
5660
conditions: list[str] = []
5761
params: list[Any] = [embedding_bytes]
5862

63+
has_repo_key = _table_has_column(conn, "code_chunks_vec", "repo_key")
64+
5965
if languages:
6066
placeholders = ",".join("?" for _ in languages)
6167
conditions.append(f"language IN ({placeholders})")
6268
params.extend(languages)
6369

70+
if repo_keys:
71+
if has_repo_key:
72+
placeholders = ",".join("?" for _ in repo_keys)
73+
conditions.append(f"repo_key IN ({placeholders})")
74+
params.extend(repo_keys)
75+
else:
76+
repo_key_paths = [
77+
f"{repo_key.rstrip('/')}/*" for repo_key in repo_keys if repo_key != "."
78+
]
79+
paths = [*(paths or []), *repo_key_paths] or paths
80+
6481
if paths:
6582
path_clauses = " OR ".join("file_path GLOB ?" for _ in paths)
6683
conditions.append(f"({path_clauses})")
6784
params.extend(paths)
6885

86+
repo_key_select = "repo_key" if has_repo_key else "NULL as repo_key"
6987
where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
7088
params.extend([limit, offset])
7189

7290
return conn.execute(
7391
f"""
74-
SELECT file_path, language, content, start_line, end_line,
92+
SELECT file_path, {repo_key_select}, language, content, start_line, end_line,
7593
vec_distance_L2(embedding, ?) as distance
7694
FROM code_chunks_vec
7795
{where}
@@ -82,6 +100,22 @@ def _full_scan_query(
82100
).fetchall()
83101

84102

103+
def _table_has_column(conn: sqlite3.Connection, table_name: str, column_name: str) -> bool:
    """Report whether ``table_name`` has a column named ``column_name``.

    The table name is passed as a bound parameter to the ``pragma_table_info``
    table-valued function instead of being interpolated into the SQL text, so
    names that are not bare identifiers are handled correctly. A table that
    does not exist yields no rows, so the result is ``False``.
    """
    rows = conn.execute("SELECT name FROM pragma_table_info(?)", (table_name,))
    return any(name == column_name for (name,) in rows)
105+
106+
107+
def _repo_key_candidates(repo_keys: list[str] | None) -> list[str | None]:
108+
if repo_keys:
109+
return list(repo_keys)
110+
return [None]
111+
112+
113+
def _language_candidates(languages: list[str] | None) -> list[str | None]:
114+
if languages:
115+
return list(languages)
116+
return [None]
117+
118+
85119
async def query_codebase(
86120
query: str,
87121
target_sqlite_db_path: Path,
@@ -90,13 +124,16 @@ async def query_codebase(
90124
offset: int = 0,
91125
languages: list[str] | None = None,
92126
paths: list[str] | None = None,
127+
repo_keys: list[str] | None = None,
93128
) -> list[QueryResult]:
94129
"""
95130
Perform vector similarity search using vec0 KNN index.
96131
97132
Uses sqlite-vec's vec0 virtual table for indexed nearest-neighbor search.
98133
Language filtering uses vec0 partition keys for exact index-level filtering.
99134
Path filtering triggers a full scan with distance computation.
135+
Repo-key filtering uses the vec0 partition key when available, and
136+
falls back to equivalent path filters for older indexes.
100137
"""
101138
if not target_sqlite_db_path.exists():
102139
raise RuntimeError(
@@ -114,34 +151,46 @@ async def query_codebase(
114151
embedding_bytes = query_embedding.astype("float32").tobytes()
115152

116153
with db.readonly() as conn:
154+
has_repo_key = _table_has_column(conn, "code_chunks_vec", "repo_key")
117155
if paths:
118-
rows = _full_scan_query(conn, embedding_bytes, limit, offset, languages, paths)
119-
elif not languages or len(languages) == 1:
156+
rows = _full_scan_query(
157+
conn, embedding_bytes, limit, offset, languages, paths, repo_keys
158+
)
159+
elif repo_keys and not has_repo_key:
160+
rows = _full_scan_query(
161+
conn, embedding_bytes, limit, offset, languages, None, repo_keys
162+
)
163+
elif (not languages or len(languages) == 1) and (not repo_keys or len(repo_keys) == 1):
120164
lang = languages[0] if languages else None
121-
rows = _knn_query(conn, embedding_bytes, limit + offset, lang)
165+
repo_key = repo_keys[0] if repo_keys else None
166+
rows = _knn_query(conn, embedding_bytes, limit + offset, lang, repo_key, has_repo_key)
122167
else:
123168
fetch_k = limit + offset
124169
rows = heapq.nsmallest(
125170
fetch_k,
126171
(
127172
row
128-
for lang in languages
129-
for row in _knn_query(conn, embedding_bytes, fetch_k, lang)
173+
for repo_key in _repo_key_candidates(repo_keys)
174+
for lang in _language_candidates(languages)
175+
for row in _knn_query(
176+
conn, embedding_bytes, fetch_k, lang, repo_key, has_repo_key
177+
)
130178
),
131-
key=lambda r: r[5],
179+
key=lambda r: r[6],
132180
)
133181

134-
if not paths:
182+
if not paths and not (repo_keys and not has_repo_key):
135183
rows = rows[offset:]
136184

137185
return [
138186
QueryResult(
139187
file_path=file_path,
188+
repo_key=repo_key,
140189
language=language,
141190
content=content,
142191
start_line=start_line,
143192
end_line=end_line,
144193
score=_l2_to_score(distance),
145194
)
146-
for file_path, language, content, start_line, end_line, distance in rows
195+
for file_path, repo_key, language, content, start_line, end_line, distance in rows
147196
]

src/cocoindex_code/schema.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ class CodeChunk:
1010

1111
id: int
1212
file_path: str
13+
repo_key: str
1314
language: str
1415
content: str
1516
start_line: int
@@ -22,6 +23,7 @@ class QueryResult:
2223
"""Result from a vector similarity query."""
2324

2425
file_path: str
26+
repo_key: str | None
2527
language: str
2628
content: str
2729
start_line: int

0 commit comments

Comments
 (0)