Skip to content

Commit df8929c

Browse files
committed
chore: minor tunes
1 parent 6600302 commit df8929c

4 files changed

Lines changed: 119 additions & 62 deletions

File tree

src/cocoindex_code/embedder.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ def embed(self, texts: list[str]) -> list[NDArray[np.float32]]:
8181
texts,
8282
convert_to_numpy=True,
8383
normalize_embeddings=self._normalize_embeddings,
84+
show_progress_bar=False,
8485
) # type: ignore[assignment]
8586
return list(embeddings)
8687

@@ -93,6 +94,7 @@ def embed_query(self, texts: list[str]) -> list[NDArray[np.float32]]:
9394
prompt_name=self._query_prompt_name,
9495
convert_to_numpy=True,
9596
normalize_embeddings=self._normalize_embeddings,
97+
show_progress_bar=False,
9698
) # type: ignore[assignment]
9799
return list(embeddings)
98100

src/cocoindex_code/indexer.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,9 @@
5656
]
5757

5858
# Chunking configuration
59-
CHUNK_SIZE = 4000
60-
MIN_CHUNK_SIZE = 500
61-
CHUNK_OVERLAP = 400
59+
CHUNK_SIZE = 2000
60+
MIN_CHUNK_SIZE = 300
61+
CHUNK_OVERLAP = 200
6262

6363
# Chunking splitter (stateless, can be module-level)
6464
splitter = RecursiveSplitter()
@@ -125,7 +125,8 @@ async def app_main() -> None:
125125
primary_key=["id"],
126126
),
127127
virtual_table_def=Vec0TableDef(
128-
auxiliary_columns=["file_path", "language", "content", "start_line", "end_line"],
128+
partition_key_columns=["language"],
129+
auxiliary_columns=["file_path", "content", "start_line", "end_line"],
129130
),
130131
)
131132

src/cocoindex_code/query.py

Lines changed: 109 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,85 @@
11
"""Query implementation for codebase search."""
22

3-
import fnmatch
3+
import heapq
4+
import sqlite3
5+
from typing import Any
46

57
import cocoindex as coco
68

79
from .config import config
810
from .schema import QueryResult
911
from .shared import SQLITE_DB, embedder
1012

11-
# Over-fetch multiplier when post-filtering is needed
12-
_FILTER_OVERFETCH = 5
13-
_FILTER_MIN_K = 200
13+
14+
def _l2_to_score(distance: float) -> float:
15+
"""Convert L2 distance to cosine similarity (exact for unit vectors)."""
16+
return 1.0 - distance * distance / 2.0
17+
18+
19+
def _knn_query(
20+
conn: sqlite3.Connection,
21+
embedding_bytes: bytes,
22+
k: int,
23+
language: str | None = None,
24+
) -> list[tuple[Any, ...]]:
25+
"""Run a vec0 KNN query, optionally constrained to a language partition."""
26+
if language is not None:
27+
return conn.execute(
28+
"""
29+
SELECT file_path, language, content, start_line, end_line, distance
30+
FROM code_chunks_vec
31+
WHERE embedding MATCH ? AND k = ? AND language = ?
32+
ORDER BY distance
33+
""",
34+
(embedding_bytes, k, language),
35+
).fetchall()
36+
return conn.execute(
37+
"""
38+
SELECT file_path, language, content, start_line, end_line, distance
39+
FROM code_chunks_vec
40+
WHERE embedding MATCH ? AND k = ?
41+
ORDER BY distance
42+
""",
43+
(embedding_bytes, k),
44+
).fetchall()
45+
46+
47+
def _full_scan_query(
48+
conn: sqlite3.Connection,
49+
embedding_bytes: bytes,
50+
limit: int,
51+
offset: int,
52+
languages: list[str] | None = None,
53+
paths: list[str] | None = None,
54+
) -> list[tuple[Any, ...]]:
55+
"""Full scan with SQL-level distance computation and filtering."""
56+
conditions: list[str] = []
57+
params: list[Any] = [embedding_bytes]
58+
59+
if languages:
60+
placeholders = ",".join("?" for _ in languages)
61+
conditions.append(f"language IN ({placeholders})")
62+
params.extend(languages)
63+
64+
if paths:
65+
path_clauses = " OR ".join("file_path GLOB ?" for _ in paths)
66+
conditions.append(f"({path_clauses})")
67+
params.extend(paths)
68+
69+
where = f"WHERE {' AND '.join(conditions)}" if conditions else ""
70+
params.extend([limit, offset])
71+
72+
return conn.execute(
73+
f"""
74+
SELECT file_path, language, content, start_line, end_line,
75+
vec_distance_L2(embedding, ?) as distance
76+
FROM code_chunks_vec
77+
{where}
78+
ORDER BY distance
79+
LIMIT ? OFFSET ?
80+
""",
81+
params,
82+
).fetchall()
1483

1584

1685
async def query_codebase(
@@ -24,76 +93,59 @@ async def query_codebase(
2493
Perform vector similarity search using vec0 KNN index.
2594
2695
Uses sqlite-vec's vec0 virtual table for indexed nearest-neighbor search.
27-
Optionally filters by language(s) and/or file path glob pattern(s).
96+
Language filtering uses vec0 partition keys for exact index-level filtering.
97+
Path filtering triggers a full scan with distance computation.
2898
"""
2999
if not config.target_sqlite_db_path.exists():
30100
raise RuntimeError(
31101
f"Index database not found at {config.target_sqlite_db_path}. "
32102
"Please run a query with refresh_index=True first."
33103
)
34104

35-
# Get the database connection from CocoIndex environment
36105
coco_env = await coco.default_env()
37106
db = coco_env.get_context(SQLITE_DB)
38107

39-
# Generate query embedding — use embed_query if available (supports asymmetric
40-
# prompting for models like nomic-embed-code that use different prefixes for
41-
# queries vs indexed documents).
108+
# Generate query embedding.
42109
if hasattr(embedder, "embed_query"):
43110
query_embedding = await embedder.embed_query(query)
44111
else:
45112
query_embedding = await embedder.embed(query)
46113

47-
# Convert to bytes for sqlite-vec (float32)
48114
embedding_bytes = query_embedding.astype("float32").tobytes()
49115

50-
# vec0 KNN queries don't support arbitrary WHERE/OFFSET, so we
51-
# over-fetch when post-filtering is needed and apply filters in Python.
52-
needs_post_filter = bool(languages or paths)
53-
if needs_post_filter:
54-
fetch_k = max((limit + offset) * _FILTER_OVERFETCH, _FILTER_MIN_K)
55-
else:
56-
fetch_k = limit + offset
57-
58-
# Query using vec0 KNN index with readonly transaction.
59-
# vec0 returns L2 distance; for normalized embeddings the ranking is
60-
# identical to cosine distance. Convert to cosine similarity via
61-
# cos_sim = 1 - L2² / 2 (exact for unit vectors).
62116
with db.value.readonly() as conn:
63-
cursor = conn.execute(
64-
"""
65-
SELECT
66-
file_path,
67-
language,
68-
content,
69-
start_line,
70-
end_line,
71-
distance
72-
FROM code_chunks_vec
73-
WHERE embedding MATCH ? AND k = ?
74-
ORDER BY distance
75-
""",
76-
(embedding_bytes, fetch_k),
77-
)
78-
rows = cursor.fetchall()
79-
80-
language_set = set(languages) if languages else None
81-
results: list[QueryResult] = []
82-
83-
for file_path, language, content, start_line, end_line, distance in rows:
84-
if language_set and language not in language_set:
85-
continue
86-
if paths and not any(fnmatch.fnmatch(file_path, p) for p in paths):
87-
continue
88-
results.append(
89-
QueryResult(
90-
file_path=file_path,
91-
language=language,
92-
content=content,
93-
start_line=start_line,
94-
end_line=end_line,
95-
score=1.0 - distance * distance / 2.0,
117+
if paths:
118+
# Path filter → full scan (vec0 can't filter on auxiliary columns).
119+
# LIMIT/OFFSET handled in SQL.
120+
rows = _full_scan_query(conn, embedding_bytes, limit, offset, languages, paths)
121+
elif not languages or len(languages) == 1:
122+
# Single language or no filter: one KNN query.
123+
lang = languages[0] if languages else None
124+
rows = _knn_query(conn, embedding_bytes, limit + offset, lang)
125+
else:
126+
# Multiple languages: separate KNN per partition, merge by distance.
127+
fetch_k = limit + offset
128+
rows = heapq.nsmallest(
129+
fetch_k,
130+
(
131+
row
132+
for lang in languages
133+
for row in _knn_query(conn, embedding_bytes, fetch_k, lang)
134+
),
135+
key=lambda r: r[5], # distance column
96136
)
97-
)
98137

99-
return results[offset : offset + limit]
138+
if not paths:
139+
rows = rows[offset:]
140+
141+
return [
142+
QueryResult(
143+
file_path=file_path,
144+
language=language,
145+
content=content,
146+
start_line=start_line,
147+
end_line=end_line,
148+
score=_l2_to_score(distance),
149+
)
150+
for file_path, language, content, start_line, end_line, distance in rows
151+
]

src/cocoindex_code/server.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ class SearchResultModel(BaseModel):
7777
" or code snippets."
7878
" Returns matching code chunks with file paths,"
7979
" line numbers, and relevance scores."
80+
" Start with a small limit (e.g., 5);"
81+
" if most results look relevant, use offset to paginate for more."
8082
),
8183
)
8284
async def search(
@@ -90,7 +92,7 @@ async def search(
9092
)
9193
),
9294
limit: int = Field(
93-
default=10,
95+
default=5,
9496
ge=1,
9597
le=100,
9698
description="Maximum number of results to return (1-100)",

0 commit comments

Comments
 (0)