11"""Query implementation for codebase search."""
22
3- import fnmatch
3+ import heapq
4+ import sqlite3
5+ from typing import Any
46
57import cocoindex as coco
68
79from .config import config
810from .schema import QueryResult
911from .shared import SQLITE_DB , embedder
1012
11- # Over-fetch multiplier when post-filtering is needed
12- _FILTER_OVERFETCH = 5
13- _FILTER_MIN_K = 200
13+
def _l2_to_score(distance: float) -> float:
    """Convert an L2 (Euclidean) distance into a cosine-similarity score.

    For unit-length vectors ``a`` and ``b``::

        |a - b|**2 == 2 - 2 * cos(a, b)   =>   cos(a, b) == 1 - d**2 / 2

    The conversion is exact only when both vectors are normalized.

    Args:
        distance: L2 distance between the query and a stored embedding.

    Returns:
        ``1.0 - distance**2 / 2.0``. A distance of 0 maps to 1.0 and a
        distance of 2 maps to -1.0.
    """
    return 1.0 - distance * distance / 2.0
17+
18+
19+ def _knn_query (
20+ conn : sqlite3 .Connection ,
21+ embedding_bytes : bytes ,
22+ k : int ,
23+ language : str | None = None ,
24+ ) -> list [tuple [Any , ...]]:
25+ """Run a vec0 KNN query, optionally constrained to a language partition."""
26+ if language is not None :
27+ return conn .execute (
28+ """
29+ SELECT file_path, language, content, start_line, end_line, distance
30+ FROM code_chunks_vec
31+ WHERE embedding MATCH ? AND k = ? AND language = ?
32+ ORDER BY distance
33+ """ ,
34+ (embedding_bytes , k , language ),
35+ ).fetchall ()
36+ return conn .execute (
37+ """
38+ SELECT file_path, language, content, start_line, end_line, distance
39+ FROM code_chunks_vec
40+ WHERE embedding MATCH ? AND k = ?
41+ ORDER BY distance
42+ """ ,
43+ (embedding_bytes , k ),
44+ ).fetchall ()
45+
46+
47+ def _full_scan_query (
48+ conn : sqlite3 .Connection ,
49+ embedding_bytes : bytes ,
50+ limit : int ,
51+ offset : int ,
52+ languages : list [str ] | None = None ,
53+ paths : list [str ] | None = None ,
54+ ) -> list [tuple [Any , ...]]:
55+ """Full scan with SQL-level distance computation and filtering."""
56+ conditions : list [str ] = []
57+ params : list [Any ] = [embedding_bytes ]
58+
59+ if languages :
60+ placeholders = "," .join ("?" for _ in languages )
61+ conditions .append (f"language IN ({ placeholders } )" )
62+ params .extend (languages )
63+
64+ if paths :
65+ path_clauses = " OR " .join ("file_path GLOB ?" for _ in paths )
66+ conditions .append (f"({ path_clauses } )" )
67+ params .extend (paths )
68+
69+ where = f"WHERE { ' AND ' .join (conditions )} " if conditions else ""
70+ params .extend ([limit , offset ])
71+
72+ return conn .execute (
73+ f"""
74+ SELECT file_path, language, content, start_line, end_line,
75+ vec_distance_L2(embedding, ?) as distance
76+ FROM code_chunks_vec
77+ { where }
78+ ORDER BY distance
79+ LIMIT ? OFFSET ?
80+ """ ,
81+ params ,
82+ ).fetchall ()
1483
1584
1685async def query_codebase (
@@ -24,76 +93,59 @@ async def query_codebase(
2493 Perform vector similarity search using vec0 KNN index.
2594
2695 Uses sqlite-vec's vec0 virtual table for indexed nearest-neighbor search.
27- Optionally filters by language(s) and/or file path glob pattern(s).
96+ Language filtering uses vec0 partition keys for exact index-level filtering.
97+ Path filtering triggers a full scan with distance computation.
2898 """
2999 if not config .target_sqlite_db_path .exists ():
30100 raise RuntimeError (
31101 f"Index database not found at { config .target_sqlite_db_path } . "
32102 "Please run a query with refresh_index=True first."
33103 )
34104
35- # Get the database connection from CocoIndex environment
36105 coco_env = await coco .default_env ()
37106 db = coco_env .get_context (SQLITE_DB )
38107
39- # Generate query embedding — use embed_query if available (supports asymmetric
40- # prompting for models like nomic-embed-code that use different prefixes for
41- # queries vs indexed documents).
108+ # Generate query embedding.
42109 if hasattr (embedder , "embed_query" ):
43110 query_embedding = await embedder .embed_query (query )
44111 else :
45112 query_embedding = await embedder .embed (query )
46113
47- # Convert to bytes for sqlite-vec (float32)
48114 embedding_bytes = query_embedding .astype ("float32" ).tobytes ()
49115
50- # vec0 KNN queries don't support arbitrary WHERE/OFFSET, so we
51- # over-fetch when post-filtering is needed and apply filters in Python.
52- needs_post_filter = bool (languages or paths )
53- if needs_post_filter :
54- fetch_k = max ((limit + offset ) * _FILTER_OVERFETCH , _FILTER_MIN_K )
55- else :
56- fetch_k = limit + offset
57-
58- # Query using vec0 KNN index with readonly transaction.
59- # vec0 returns L2 distance; for normalized embeddings the ranking is
60- # identical to cosine distance. Convert to cosine similarity via
61- # cos_sim = 1 - L2² / 2 (exact for unit vectors).
62116 with db .value .readonly () as conn :
63- cursor = conn .execute (
64- """
65- SELECT
66- file_path,
67- language,
68- content,
69- start_line,
70- end_line,
71- distance
72- FROM code_chunks_vec
73- WHERE embedding MATCH ? AND k = ?
74- ORDER BY distance
75- """ ,
76- (embedding_bytes , fetch_k ),
77- )
78- rows = cursor .fetchall ()
79-
80- language_set = set (languages ) if languages else None
81- results : list [QueryResult ] = []
82-
83- for file_path , language , content , start_line , end_line , distance in rows :
84- if language_set and language not in language_set :
85- continue
86- if paths and not any (fnmatch .fnmatch (file_path , p ) for p in paths ):
87- continue
88- results .append (
89- QueryResult (
90- file_path = file_path ,
91- language = language ,
92- content = content ,
93- start_line = start_line ,
94- end_line = end_line ,
95- score = 1.0 - distance * distance / 2.0 ,
117+ if paths :
118+ # Path filter → full scan (vec0 can't filter on auxiliary columns).
119+ # LIMIT/OFFSET handled in SQL.
120+ rows = _full_scan_query (conn , embedding_bytes , limit , offset , languages , paths )
121+ elif not languages or len (languages ) == 1 :
122+ # Single language or no filter: one KNN query.
123+ lang = languages [0 ] if languages else None
124+ rows = _knn_query (conn , embedding_bytes , limit + offset , lang )
125+ else :
126+ # Multiple languages: separate KNN per partition, merge by distance.
127+ fetch_k = limit + offset
128+ rows = heapq .nsmallest (
129+ fetch_k ,
130+ (
131+ row
132+ for lang in languages
133+ for row in _knn_query (conn , embedding_bytes , fetch_k , lang )
134+ ),
135+ key = lambda r : r [5 ], # distance column
96136 )
97- )
98137
99- return results [offset : offset + limit ]
138+ if not paths :
139+ rows = rows [offset :]
140+
141+ return [
142+ QueryResult (
143+ file_path = file_path ,
144+ language = language ,
145+ content = content ,
146+ start_line = start_line ,
147+ end_line = end_line ,
148+ score = _l2_to_score (distance ),
149+ )
150+ for file_path , language , content , start_line , end_line , distance in rows
151+ ]
0 commit comments