|
6 | 6 | from chromadb import Where |
7 | 7 | from chromadb.api.models.AsyncCollection import AsyncCollection |
8 | 8 | from chromadb.api.types import IncludeEnum, QueryResult |
9 | | -from chromadb.errors import InvalidCollectionException, InvalidDimensionException |
10 | 9 | from tree_sitter import Point |
11 | 10 |
|
12 | 11 | from vectorcode.chunking import Chunk, StringChunker |
|
18 | 17 | expand_path, |
19 | 18 | ) |
20 | 19 | from vectorcode.common import ( |
21 | | - ClientManager, |
22 | | - get_collection, |
23 | 20 | get_embedding_function, |
24 | | - verify_ef, |
25 | 21 | ) |
26 | | -from vectorcode.database import types |
| 22 | +from vectorcode.database import get_database_connector, types |
27 | 23 | from vectorcode.database.base import DatabaseConnectorBase |
28 | 24 | from vectorcode.subcommands.query.reranker import ( |
29 | | - RerankerError, |
30 | 25 | get_reranker, |
31 | 26 | ) |
32 | 27 |
|
@@ -174,67 +169,125 @@ def make_output_path(path: str, absolute: bool) -> str: |
174 | 169 | return structured_result |
175 | 170 |
|
176 | 171 |
|
def _prepare_formatted_result(
    reranked_results: list[str | Chunk],
) -> list[dict[str, str | int | None]]:
    """
    Convert reranked results into plain dictionaries ready for output.

    Each item of `reranked_results` is either a file path (`str`) or a `Chunk`:

    - a path yields `{"path": ..., "document": <file content>}`; paths that
      are not regular files, or that cannot be read, are skipped with a warning.
    - a `Chunk` yields `path`, `chunk`, `end_line` and `chunk_id`, plus
      `start_line` when the chunk has a start point. `end_line` is `None`
      when the chunk has no end point.

    The order of `reranked_results` is preserved in the returned list.
    """
    results: list[dict[str, str | int | None]] = []
    for res in reranked_results:
        if isinstance(res, str):
            if not os.path.isfile(res):  # pragma: nocover
                logger.warning(f"Skipping non-existent file: {res}")
                continue
            try:
                with open(res) as fin:
                    document = fin.read()
            except (OSError, UnicodeDecodeError) as e:  # pragma: nocover
                # The file may vanish or be unreadable after the isfile()
                # check; skip it instead of aborting the whole query.
                logger.warning(
                    f"Skipping unreadable file: {res} ({e.__class__.__name__})"
                )
                continue
            results.append({"path": res, "document": document})
            continue

        assert isinstance(res, Chunk)
        if res.start is None or res.end is None:
            logger.warning(
                "This chunk doesn't have line range metadata. Please try re-vectorising the project."
            )
        output_dict: dict[str, str | int | None] = {
            "path": res.path,
            "chunk": res.text,
            # Always present so consumers can rely on the key; None means unknown.
            "end_line": res.end.row if res.end is not None else None,
            "chunk_id": res.id,
        }
        if res.start is not None:
            output_dict["start_line"] = res.start.row
        results.append(output_dict)
    return results
| 202 | + |
| 203 | + |
async def get_reranked_results(
    config: Config,
    database: DatabaseConnectorBase,
) -> list[str | Chunk]:
    """
    Query `database` and rerank whatever it returns.

    A reranker is obtained from `config` via `get_reranker`; the awaited
    result of `database.query()` is handed to its `rerank` coroutine as the
    `results` keyword argument. The reranker's output (a list of paths or
    `Chunk`s, ranked by similarity) is returned as-is.
    """
    reranker = get_reranker(config)
    candidates = await database.query()
    return await reranker.rerank(results=candidates)
181 | 214 |
|
182 | 215 |
|
async def query(configs: Config) -> int:
    """
    Run a query against the configured database and print the results.

    `QueryInclude.path` is appended to `configs.include` when missing (the
    list is modified in place). Asking for both `chunk` and `document` is
    rejected.

    Output goes to stdout: a single JSON document when `configs.pipe` is set,
    otherwise one `header + value` line per included item, with a blank line
    between results.

    Returns 0 on success, or 1 when the `--include` combination is invalid.
    """
    if QueryInclude.path not in configs.include:
        configs.include.append(QueryInclude.path)
    if (
        QueryInclude.chunk in configs.include
        and QueryInclude.document in configs.include
    ):
        # Explicit check instead of `assert`: assertions are stripped under
        # `python -O`, and invalid user input should produce an exit code
        # rather than an AssertionError traceback.
        logger.error(
            "`chunk` and `document` cannot be used at the same time for `--include`."
        )
        return 1

    # TODO: the previous implementation also turned a missing collection, an
    # embedding-dimension mismatch and reranker failures into a logged error
    # plus exit code 1. That handling has not been ported to the database
    # connector API yet.
    database = get_database_connector(configs)
    reranked_results = await get_reranked_results(configs, database)
    formatted_results = _prepare_formatted_result(reranked_results)
    if configs.pipe:
        print(json.dumps(formatted_results))
    else:
        last_idx = len(formatted_results) - 1
        for idx, result in enumerate(formatted_results):
            for include_item in configs.include:
                print(f"{include_item.to_header()}{result.get(include_item.value)}")
            if idx != last_idx:
                # Blank line separates consecutive results, none after the last.
                print()
    return 0
0 commit comments