Skip to content

Commit 7751f71

Browse files
author
Zhe Yu
committed
refactor(cli): Refactor query command to use DB adapter layer
1 parent b35b7c0 commit 7751f71

3 files changed

Lines changed: 117 additions & 64 deletions

File tree

src/vectorcode/database/base.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,7 @@ async def count(self, what: ResultType = ResultType.chunk) -> int:
7373
@abstractmethod
7474
async def query(
7575
self,
76-
) -> Sequence[QueryResult]:
76+
) -> list[QueryResult]:
7777
pass
7878

7979
@abstractmethod

src/vectorcode/database/chroma0.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@
4444
_logger = logging.getLogger(name=__name__)
4545

4646

47-
def __convert_chroma_query_results(
47+
def _convert_chroma_query_results(
4848
chroma_result: QueryResult, queries: Sequence[str]
4949
) -> list[types.QueryResult]:
5050
"""Convert chromadb query result to in-house query results"""
@@ -302,7 +302,7 @@ async def query(self):
302302
n_results=query_count,
303303
where=query_filter,
304304
)
305-
return __convert_chroma_query_results(query_result, self._configs.query)
305+
return _convert_chroma_query_results(query_result, self._configs.query)
306306

307307
async def _create_or_get_collection(
308308
self, collection_path: str, allow_create: bool = False

src/vectorcode/subcommands/query/__init__.py

Lines changed: 114 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66
from chromadb import Where
77
from chromadb.api.models.AsyncCollection import AsyncCollection
88
from chromadb.api.types import IncludeEnum, QueryResult
9-
from chromadb.errors import InvalidCollectionException, InvalidDimensionException
109
from tree_sitter import Point
1110

1211
from vectorcode.chunking import Chunk, StringChunker
@@ -18,15 +17,11 @@
1817
expand_path,
1918
)
2019
from vectorcode.common import (
21-
ClientManager,
22-
get_collection,
2320
get_embedding_function,
24-
verify_ef,
2521
)
26-
from vectorcode.database import types
22+
from vectorcode.database import get_database_connector, types
2723
from vectorcode.database.base import DatabaseConnectorBase
2824
from vectorcode.subcommands.query.reranker import (
29-
RerankerError,
3025
get_reranker,
3126
)
3227

@@ -174,67 +169,125 @@ def make_output_path(path: str, absolute: bool) -> str:
174169
return structured_result
175170

176171

172+
def _prepare_formatted_result(
173+
reranked_results: list[str | Chunk],
174+
) -> list[dict[str, str | int]]:
175+
results: list[dict[str, str | int]] = []
176+
for res in reranked_results:
177+
if isinstance(res, str):
178+
if os.path.isfile(res):
179+
# path to a file
180+
with open(res) as fin:
181+
results.append({"path": res, "document": fin.read()})
182+
else: # pragma: nocover
183+
logger.warning(f"Skipping non-existent file: {res}")
184+
else:
185+
assert isinstance(res, Chunk)
186+
if res.start is None or res.end is None:
187+
logger.warning(
188+
"This chunk doesn't have line range metadata. Please try re-vectorising the project."
189+
)
190+
output_dict = {
191+
"path": res.path,
192+
"chunk": res.text,
193+
"end_line": res.end.row if res.end is not None else None,
194+
"chunk_id": res.id,
195+
}
196+
if res.start:
197+
output_dict["start_line"] = res.start.row
198+
if res.end:
199+
output_dict["end_line"] = res.end.row
200+
results.append(output_dict)
201+
return results
202+
203+
177204
async def get_reranked_results(
205+
config: Config,
178206
database: DatabaseConnectorBase,
179-
) -> list[types.QueryResult]:
180-
await database.query()
207+
) -> list[str | Chunk]:
208+
"""
209+
Return a list of paths or `Chunk`s ranked by similarity.
210+
"""
211+
reranker = get_reranker(config)
212+
reranked_results = await reranker.rerank(results=await database.query())
213+
return reranked_results
181214

182215

183216
async def query(configs: Config) -> int:
184-
if (
217+
if QueryInclude.path not in configs.include:
218+
configs.include.append(QueryInclude.path)
219+
assert not (
185220
QueryInclude.chunk in configs.include
186221
and QueryInclude.document in configs.include
187-
):
188-
logger.error(
189-
"Having both chunk and document in the output is not supported!",
190-
)
191-
return 1
192-
async with ClientManager().get_client(configs) as client:
193-
try:
194-
collection = await get_collection(client, configs, False)
195-
if not verify_ef(collection, configs):
196-
return 1
197-
except (ValueError, InvalidCollectionException) as e:
198-
logger.error(
199-
f"{e.__class__.__name__}: There's no existing collection for {configs.project_root}",
200-
)
201-
return 1
202-
except InvalidDimensionException as e:
203-
logger.error(
204-
f"{e.__class__.__name__}: The collection was embedded with a different embedding model.",
205-
)
206-
return 1
207-
except IndexError as e: # pragma: nocover
208-
logger.error(
209-
f"{e.__class__.__name__}: Failed to get the collection. Please check your config."
210-
)
211-
return 1
222+
), "`chunk` and `document` cannot be used at the same time for `--include`."
212223

213-
if not configs.pipe:
214-
print("Starting querying...")
224+
database = get_database_connector(configs)
225+
reranked_results = await get_reranked_results(configs, database)
226+
formatted_results = _prepare_formatted_result(reranked_results)
227+
if configs.pipe:
228+
print(json.dumps(formatted_results))
229+
else:
230+
for idx, result in enumerate(formatted_results):
231+
for include_item in configs.include:
232+
print(f"{include_item.to_header()}{result.get(include_item.value)}")
233+
if idx != len(formatted_results) - 1:
234+
print()
235+
return 0
215236

216-
if QueryInclude.chunk in configs.include:
217-
if len((await collection.get(where={"start": {"$gte": 0}}))["ids"]) == 0:
218-
logger.warning(
219-
"""
220-
This collection doesn't contain line range metadata. Falling back to `--include path document`.
221-
Please re-vectorise it to use `--include chunk`.""",
222-
)
223-
configs.include = [QueryInclude.path, QueryInclude.document]
224-
225-
try:
226-
structured_result = await build_query_results(collection, configs)
227-
except RerankerError as e: # pragma: nocover
228-
# error logs should be handled where they're raised
229-
logger.error(f"{e.__class__.__name__}")
230-
return 1
231-
232-
if configs.pipe:
233-
print(json.dumps(structured_result))
234-
else:
235-
for idx, result in enumerate(structured_result):
236-
for include_item in configs.include:
237-
print(f"{include_item.to_header()}{result.get(include_item.value)}")
238-
if idx != len(structured_result) - 1:
239-
print()
240-
return 0
237+
# if (
238+
# QueryInclude.chunk in configs.include
239+
# and QueryInclude.document in configs.include
240+
# ):
241+
# logger.error(
242+
# "Having both chunk and document in the output is not supported!",
243+
# )
244+
# return 1
245+
# async with ClientManager().get_client(configs) as client:
246+
# try:
247+
# collection = await get_collection(client, configs, False)
248+
# if not verify_ef(collection, configs):
249+
# return 1
250+
# except (ValueError, InvalidCollectionException) as e:
251+
# logger.error(
252+
# f"{e.__class__.__name__}: There's no existing collection for {configs.project_root}",
253+
# )
254+
# return 1
255+
# except InvalidDimensionException as e:
256+
# logger.error(
257+
# f"{e.__class__.__name__}: The collection was embedded with a different embedding model.",
258+
# )
259+
# return 1
260+
# except IndexError as e: # pragma: nocover
261+
# logger.error(
262+
# f"{e.__class__.__name__}: Failed to get the collection. Please check your config."
263+
# )
264+
# return 1
265+
#
266+
# if not configs.pipe:
267+
# print("Starting querying...")
268+
#
269+
# if QueryInclude.chunk in configs.include:
270+
# if len((await collection.get(where={"start": {"$gte": 0}}))["ids"]) == 0:
271+
# logger.warning(
272+
# """
273+
# This collection doesn't contain line range metadata. Falling back to `--include path document`.
274+
# Please re-vectorise it to use `--include chunk`.""",
275+
# )
276+
# configs.include = [QueryInclude.path, QueryInclude.document]
277+
#
278+
# try:
279+
# structured_result = await build_query_results(collection, configs)
280+
# except RerankerError as e: # pragma: nocover
281+
# # error logs should be handled where they're raised
282+
# logger.error(f"{e.__class__.__name__}")
283+
# return 1
284+
#
285+
# if configs.pipe:
286+
# print(json.dumps(structured_result))
287+
# else:
288+
# for idx, result in enumerate(structured_result):
289+
# for include_item in configs.include:
290+
# print(f"{include_item.to_header()}{result.get(include_item.value)}")
291+
# if idx != len(structured_result) - 1:
292+
# print()
293+
# return 0

0 commit comments

Comments
 (0)