@@ -21,26 +21,29 @@ def _knn_query(
2121 embedding_bytes : bytes ,
2222 k : int ,
2323 language : str | None = None ,
24+ repo_key : str | None = None ,
25+ has_repo_key : bool = False ,
2426) -> list [tuple [Any , ...]]:
2527 """Run a vec0 KNN query, optionally constrained to a language partition."""
28+ conditions = ["embedding MATCH ?" , "k = ?" ]
29+ params : list [Any ] = [embedding_bytes , k ]
30+ if repo_key is not None :
31+ conditions .append ("repo_key = ?" )
32+ params .append (repo_key )
2633 if language is not None :
27- return conn .execute (
28- """
29- SELECT file_path, language, content, start_line, end_line, distance
30- FROM code_chunks_vec
31- WHERE embedding MATCH ? AND k = ? AND language = ?
32- ORDER BY distance
33- """ ,
34- (embedding_bytes , k , language ),
35- ).fetchall ()
34+ conditions .append ("language = ?" )
35+ params .append (language )
36+
37+ repo_key_select = "repo_key" if has_repo_key else "NULL"
3638 return conn .execute (
37- """
38- SELECT file_path, language, content, start_line, end_line, distance
39+ f"""
40+ SELECT file_path, { repo_key_select } as repo_key,
41+ language, content, start_line, end_line, distance
3942 FROM code_chunks_vec
40- WHERE embedding MATCH ? AND k = ?
43+ WHERE { " AND " . join ( conditions ) }
4144 ORDER BY distance
4245 """ ,
43- ( embedding_bytes , k ) ,
46+ params ,
4447 ).fetchall ()
4548
4649
@@ -51,27 +54,42 @@ def _full_scan_query(
5154 offset : int ,
5255 languages : list [str ] | None = None ,
5356 paths : list [str ] | None = None ,
57+ repo_keys : list [str ] | None = None ,
5458) -> list [tuple [Any , ...]]:
5559 """Full scan with SQL-level distance computation and filtering."""
5660 conditions : list [str ] = []
5761 params : list [Any ] = [embedding_bytes ]
5862
63+ has_repo_key = _table_has_column (conn , "code_chunks_vec" , "repo_key" )
64+
5965 if languages :
6066 placeholders = "," .join ("?" for _ in languages )
6167 conditions .append (f"language IN ({ placeholders } )" )
6268 params .extend (languages )
6369
70+ if repo_keys :
71+ if has_repo_key :
72+ placeholders = "," .join ("?" for _ in repo_keys )
73+ conditions .append (f"repo_key IN ({ placeholders } )" )
74+ params .extend (repo_keys )
75+ else :
76+ repo_key_paths = [
77+ f"{ repo_key .rstrip ('/' )} /*" for repo_key in repo_keys if repo_key != "."
78+ ]
79+ paths = [* (paths or []), * repo_key_paths ] or paths
80+
6481 if paths :
6582 path_clauses = " OR " .join ("file_path GLOB ?" for _ in paths )
6683 conditions .append (f"({ path_clauses } )" )
6784 params .extend (paths )
6885
86+ repo_key_select = "repo_key" if has_repo_key else "NULL as repo_key"
6987 where = f"WHERE { ' AND ' .join (conditions )} " if conditions else ""
7088 params .extend ([limit , offset ])
7189
7290 return conn .execute (
7391 f"""
74- SELECT file_path, language, content, start_line, end_line,
92+ SELECT file_path, { repo_key_select } , language, content, start_line, end_line,
7593 vec_distance_L2(embedding, ?) as distance
7694 FROM code_chunks_vec
7795 { where }
@@ -82,6 +100,22 @@ def _full_scan_query(
82100 ).fetchall ()
83101
84102
103+ def _table_has_column (conn : sqlite3 .Connection , table_name : str , column_name : str ) -> bool :
104+ return any (row [1 ] == column_name for row in conn .execute (f"PRAGMA table_info({ table_name } )" ))
105+
106+
107+ def _repo_key_candidates (repo_keys : list [str ] | None ) -> list [str | None ]:
108+ if repo_keys :
109+ return list (repo_keys )
110+ return [None ]
111+
112+
113+ def _language_candidates (languages : list [str ] | None ) -> list [str | None ]:
114+ if languages :
115+ return list (languages )
116+ return [None ]
117+
118+
85119async def query_codebase (
86120 query : str ,
87121 target_sqlite_db_path : Path ,
@@ -90,13 +124,16 @@ async def query_codebase(
90124 offset : int = 0 ,
91125 languages : list [str ] | None = None ,
92126 paths : list [str ] | None = None ,
127+ repo_keys : list [str ] | None = None ,
93128) -> list [QueryResult ]:
94129 """
95130 Perform vector similarity search using vec0 KNN index.
96131
97132 Uses sqlite-vec's vec0 virtual table for indexed nearest-neighbor search.
98133 Language filtering uses vec0 partition keys for exact index-level filtering.
99134 Path filtering triggers a full scan with distance computation.
135+ Repo-key filtering uses the vec0 partition key when available, and
136+ falls back to equivalent path filters for older indexes.
100137 """
101138 if not target_sqlite_db_path .exists ():
102139 raise RuntimeError (
@@ -114,34 +151,46 @@ async def query_codebase(
114151 embedding_bytes = query_embedding .astype ("float32" ).tobytes ()
115152
116153 with db .readonly () as conn :
154+ has_repo_key = _table_has_column (conn , "code_chunks_vec" , "repo_key" )
117155 if paths :
118- rows = _full_scan_query (conn , embedding_bytes , limit , offset , languages , paths )
119- elif not languages or len (languages ) == 1 :
156+ rows = _full_scan_query (
157+ conn , embedding_bytes , limit , offset , languages , paths , repo_keys
158+ )
159+ elif repo_keys and not has_repo_key :
160+ rows = _full_scan_query (
161+ conn , embedding_bytes , limit , offset , languages , None , repo_keys
162+ )
163+ elif (not languages or len (languages ) == 1 ) and (not repo_keys or len (repo_keys ) == 1 ):
120164 lang = languages [0 ] if languages else None
121- rows = _knn_query (conn , embedding_bytes , limit + offset , lang )
165+ repo_key = repo_keys [0 ] if repo_keys else None
166+ rows = _knn_query (conn , embedding_bytes , limit + offset , lang , repo_key , has_repo_key )
122167 else :
123168 fetch_k = limit + offset
124169 rows = heapq .nsmallest (
125170 fetch_k ,
126171 (
127172 row
128- for lang in languages
129- for row in _knn_query (conn , embedding_bytes , fetch_k , lang )
173+ for repo_key in _repo_key_candidates (repo_keys )
174+ for lang in _language_candidates (languages )
175+ for row in _knn_query (
176+ conn , embedding_bytes , fetch_k , lang , repo_key , has_repo_key
177+ )
130178 ),
131- key = lambda r : r [5 ],
179+ key = lambda r : r [6 ],
132180 )
133181
134- if not paths :
182+ if not paths and not ( repo_keys and not has_repo_key ) :
135183 rows = rows [offset :]
136184
137185 return [
138186 QueryResult (
139187 file_path = file_path ,
188+ repo_key = repo_key ,
140189 language = language ,
141190 content = content ,
142191 start_line = start_line ,
143192 end_line = end_line ,
144193 score = _l2_to_score (distance ),
145194 )
146- for file_path , language , content , start_line , end_line , distance in rows
195+ for file_path , repo_key , language , content , start_line , end_line , distance in rows
147196 ]
0 commit comments