refactor(cli): make function signatures more consistent.

Zhe Yu · Zhe Yu · commit c84ff39ed771 · 2025-08-29T13:47:17.000+08:00
diff --git a/src/vectorcode/chunking.py b/src/vectorcode/chunking.py
@@ -27,12 +27,14 @@ class Chunk:
     text: str
     start: Point | None = None
     end: Point | None = None
+    path: str | None = None
+    id: str | None = None
 
     def __str__(self):
         return self.text
 
     def __hash__(self) -> int:
-        return hash(f"VectorCodeChunk({self.start}:{self.end}@{self.text})")
+        return hash(f"VectorCodeChunk_{self.path}({self.start}:{self.end}@{self.text})")
 
     def export_dict(self):
         d: dict[str, str | dict[str, int]] = {"text": self.text}
@@ -48,6 +50,10 @@ def export_dict(self):
                     "end": {"row": self.end.row, "column": self.end.column},
                 }
             )
+        if self.path is not None:
+            d["path"] = self.path
+        if self.id:
+            d["chunk_id"] = self.id
         return d
 
 
diff --git a/src/vectorcode/subcommands/query/__init__.py b/src/vectorcode/subcommands/query/__init__.py
@@ -3,7 +3,7 @@
 import os
 from typing import Any, cast
 
-from chromadb import GetResult, Where
+from chromadb import Where
 from chromadb.api.models.AsyncCollection import AsyncCollection
 from chromadb.api.types import IncludeEnum, QueryResult
 from chromadb.errors import InvalidCollectionException, InvalidDimensionException
@@ -39,19 +39,23 @@ def convert_query_results(
     assert chroma_result["documents"] is not None
     assert chroma_result["distances"] is not None
     assert chroma_result["metadatas"] is not None
+    assert chroma_result["ids"] is not None
 
     chroma_results_list: list[vectorcode_types.QueryResult] = []
     for q_i in range(len(queries)):
         q = queries[q_i]
         documents = chroma_result["documents"][q_i]
         distances = chroma_result["distances"][q_i]
         metadatas = chroma_result["metadatas"][q_i]
-        for doc, dist, meta in zip(documents, distances, metadatas):
-            chunk = Chunk(text=doc)
+        ids = chroma_result["ids"][q_i]
+        for doc, dist, meta, _id in zip(documents, distances, metadatas, ids):
+            chunk = Chunk(text=doc, id=_id)
             if meta.get("start"):
                 chunk.start = Point(int(meta.get("start", 0)), 0)
             if meta.get("end"):
-                chunk.end = Point(int(meta.get("end", 0)) + 1, 0)
+                chunk.end = Point(int(meta.get("end", 0)), 0)
+            if meta.get("path"):
+                chunk.path = str(meta["path"])
             chroma_results_list.append(
                 vectorcode_types.QueryResult(
                     chunk=chunk,
@@ -65,7 +69,7 @@ def convert_query_results(
 
 async def get_query_result_files(
     collection: AsyncCollection, configs: Config
-) -> list[str]:
+) -> list[str | Chunk]:
     query_chunks = []
     assert configs.query, "Query messages cannot be empty."
     chunker = StringChunker(configs)
@@ -126,63 +130,43 @@ async def get_query_result_files(
 async def build_query_results(
     collection: AsyncCollection, configs: Config
 ) -> list[dict[str, str | int]]:
-    structured_result = []
-    for identifier in await get_query_result_files(collection, configs):
-        if os.path.isfile(identifier):
-            if configs.use_absolute_path:
-                output_path = os.path.abspath(identifier)
-            else:
-                output_path = os.path.relpath(identifier, configs.project_root)
-            full_result = {"path": output_path}
-            with open(identifier) as fin:
-                document = fin.read()
-                full_result["document"] = document
+    assert configs.project_root
 
-            structured_result.append(
-                {str(key): full_result[str(key)] for key in configs.include}
-            )
-        elif QueryInclude.chunk in configs.include:
-            chunks: GetResult = await collection.get(
-                identifier, include=[IncludeEnum.metadatas, IncludeEnum.documents]
-            )
-            meta = chunks.get(
-                "metadatas",
-            )
-            if meta is not None and len(meta) != 0:
-                chunk_texts = chunks.get("documents")
-                assert chunk_texts is not None, (
-                    "QueryResult does not contain `documents`!"
-                )
-                full_result: dict[str, str | int] = {
-                    "chunk": str(chunk_texts[0]),
-                    "chunk_id": identifier,
-                }
-                if meta[0].get("start") is not None and meta[0].get("end") is not None:
-                    path = str(meta[0].get("path"))
-                    with open(path) as fin:
-                        start: int = int(meta[0]["start"])
-                        end: int = int(meta[0]["end"])
-                        full_result["chunk"] = "".join(fin.readlines()[start : end + 1])
-                    full_result["start_line"] = start
-                    full_result["end_line"] = end
-                    if QueryInclude.path in configs.include:
-                        full_result["path"] = str(
-                            meta[0]["path"]
-                            if configs.use_absolute_path
-                            else os.path.relpath(
-                                str(meta[0]["path"]), str(configs.project_root)
-                            )
-                        )
-
-                    structured_result.append(full_result)
-            else:  # pragma: nocover
-                logger.error(
-                    "This collection doesn't support chunk-mode output because it lacks the necessary metadata. Please re-vectorise it.",
-                )
+    def make_output_path(path: str, absolute: bool) -> str:
+        if absolute:
+            if os.path.isabs(path):
+                return path
+            return os.path.abspath(os.path.join(str(configs.project_root), path))
+        else:
+            rel_path = os.path.relpath(path, configs.project_root)
+            if isinstance(rel_path, bytes):  # pragma: nocover
+                # for some reason, some Python versions report that `os.path.relpath` returns bytes, so decode it to str.
+                rel_path = rel_path.decode()
+            return rel_path
 
+    structured_result = []
+    for res in await get_query_result_files(collection, configs):
+        if isinstance(res, str):
+            output_path = make_output_path(res, configs.use_absolute_path)
+            io_path = make_output_path(res, True)
+            if not os.path.isfile(io_path):
+                logger.warning(f"{io_path} is no longer a valid file.")
+                continue
+            with open(io_path) as fin:
+                structured_result.append({"path": output_path, "document": fin.read()})
         else:
-            logger.warning(
-                f"{identifier} is no longer a valid file! Please re-run vectorcode vectorise to refresh the database.",
+            res = cast(Chunk, res)
+            assert res.path, f"{res} has no `path` attribute."
+            structured_result.append(
+                {
+                    "path": make_output_path(res.path, configs.use_absolute_path)
+                    if res.path is not None
+                    else None,
+                    "chunk": res.text,
+                    "start_line": res.start.row if res.start is not None else None,
+                    "end_line": res.end.row if res.end is not None else None,
+                    "chunk_id": res.id,
+                }
             )
     for result in structured_result:
         if result.get("path") is not None:
diff --git a/src/vectorcode/subcommands/query/reranker/base.py b/src/vectorcode/subcommands/query/reranker/base.py
@@ -50,28 +50,33 @@ async def compute_similarity(
         self, results: list[QueryResult]
     ) -> None:  # pragma: nocover
         """
-        Modify the `QueryResult.scores` field IN-PLACE so that they contain the correct scores.
+        Modify the `QueryResult.scores` field **IN-PLACE** so that they contain the correct scores.
         """
         raise NotImplementedError
 
-    async def rerank(self, results: list[QueryResult]) -> list[str]:
+    async def rerank(self, results: list[QueryResult]) -> list[str | Chunk]:
         if len(results) == 0:
             return []
+
+        # compute the similarity scores
         await self.compute_similarity(results)
 
+        # group the results by the query type: file (path) or chunk
+        # and only keep the `top_k` results for each group
         group_by = "path"
         if QueryInclude.chunk in self.configs.include:
             group_by = "chunk"
         grouped_results = QueryResult.group(*results, by=group_by, top_k="auto")
 
+        # compute the mean scores for each of the groups
         scores: dict[Chunk | str, float] = {}
         for key in grouped_results.keys():
             scores[key] = float(
                 numpy.mean(tuple(i.mean_score() for i in grouped_results[key]))
             )
 
         return list(
-            str(i)
+            i
             for i in heapq.nlargest(
                 self.configs.n_result, grouped_results.keys(), key=lambda x: scores[x]
             )
diff --git a/src/vectorcode/subcommands/query/types.py b/src/vectorcode/subcommands/query/types.py
@@ -86,8 +86,9 @@ def __gt__(self, other: "QueryResult"):
         return self.mean_score() > other.mean_score()
 
     def __eq__(self, other: object, /) -> bool:
-        assert isinstance(other, QueryResult)
-        return self.mean_score() == other.mean_score()
+        return (
+            isinstance(other, QueryResult) and self.mean_score() == other.mean_score()
+        )
 
     def is_same_doc(self, other: "QueryResult") -> bool:
         return self.path == other.path and self.chunk == other.chunk
diff --git a/tests/subcommands/query/test_query.py b/tests/subcommands/query/test_query.py
@@ -1,7 +1,7 @@
-from unittest.mock import AsyncMock, MagicMock, mock_open, patch
+from unittest.mock import AsyncMock, MagicMock, patch
 
 import pytest
-from chromadb import GetResult
+from chromadb import QueryResult
 from chromadb.api.models.AsyncCollection import AsyncCollection
 from chromadb.api.types import IncludeEnum
 from chromadb.errors import InvalidCollectionException, InvalidDimensionException
@@ -136,43 +136,34 @@ async def test_build_query_results_chunk_mode_success(mock_collection, mock_conf
     mock_config.include = [QueryInclude.chunk, QueryInclude.path]
     mock_config.project_root = "/test/project"
     mock_config.use_absolute_path = False
-    identifier = "chunk_id_1"
+    mock_config.query = ["dummy_query"]
+    identifier = "chunk_id"
     file_path = "/test/project/subdir/file1.py"
     relative_path = "subdir/file1.py"
     start_line = 5
     end_line = 10
 
     full_file_content_lines = [f"line {i}\n" for i in range(15)]
-    full_file_content = "".join(full_file_content_lines)
 
     expected_chunk_content = "".join(full_file_content_lines[start_line : end_line + 1])
 
-    mock_get_result = GetResult(
-        ids=[identifier],
-        embeddings=None,
-        documents=["original chunk doc in db"],
-        metadatas=[{"path": file_path, "start": start_line, "end": end_line}],
+    mock_get_result = QueryResult(
+        ids=[[identifier]],
+        documents=[[expected_chunk_content]],
+        metadatas=[[{"path": file_path, "start": start_line, "end": end_line}]],
+        distances=[[0.2]],
     )
-
+    mock_collection.query = AsyncMock(return_value=mock_get_result)
     with (
         patch(
             "vectorcode.subcommands.query.get_query_result_files",
-            return_value=[identifier],
+            return_value=await get_query_result_files(mock_collection, mock_config),
         ),
         patch("os.path.isfile", return_value=False),
-        patch("builtins.open", mock_open(read_data=full_file_content)) as mocked_open,
         patch("os.path.relpath", return_value=relative_path) as mock_relpath,
     ):
-        mock_collection.get = AsyncMock(return_value=mock_get_result)
-
         results = await build_query_results(mock_collection, mock_config)
 
-        mock_collection.get.assert_called_once_with(
-            identifier, include=[IncludeEnum.metadatas, IncludeEnum.documents]
-        )
-
-        mocked_open.assert_called_once_with(file_path)
-
         mock_relpath.assert_called_once_with(file_path, str(mock_config.project_root))
 
         assert len(results) == 1
diff --git a/tests/subcommands/query/test_reranker.py b/tests/subcommands/query/test_reranker.py
@@ -134,15 +134,15 @@ async def test_naive_reranker_rerank_chunks(naive_reranker_conf, query_result):
     """Test basic reranking functionality of NaiveReranker"""
     naive_reranker_conf.include = [QueryInclude.chunk]
     reranker = NaiveReranker(naive_reranker_conf)
-    chunk_text = {str(i.chunk) for i in query_result}
+    chunks = {i.chunk for i in query_result}
     result = await reranker.rerank(query_result)
 
     # Check the result is a list of paths with correct length
     assert isinstance(result, list)
     assert len(result) <= naive_reranker_conf.n_result
 
     for res in result:
-        assert res in chunk_text
+        assert res in chunks
 
 
 @pytest.mark.asyncio