simplify the process of building a new reranker by creating a compute_similarity method

Davidyz · Davidyz · commit 498ec618b26d · 2025-04-21T11:40:03.000+08:00
diff --git a/src/vectorcode/subcommands/query/reranker/base.py b/src/vectorcode/subcommands/query/reranker/base.py
@@ -1,18 +1,22 @@
+import heapq
 import logging
 from abc import ABC, abstractmethod
-from typing import Any
+from collections import defaultdict
+from typing import Any, DefaultDict, Optional, Sequence, cast
 
+import numpy
 from chromadb.api.types import QueryResult
 
-from vectorcode.cli_utils import Config
+from vectorcode.cli_utils import Config, QueryInclude
 
 logger = logging.getLogger(name=__name__)
 
 
 class RerankerBase(ABC):
     """This is the base class for the rerankers.
     You should use the configs.reranker_params field to store and pass the parameters used for your reranker.
-    You should implement the `rerank` method, which returns a list of chunk IDs if QueryInclude.chunk is in configs.include, or a list of paths otherwise.
+    You should implement the `compute_similarity` method, which will be called by `rerank` to compute
+    similarity scores between the search query and the results.
     The items in the returned list should be sorted such that the relevance decreases along the list.
 
     The class doc string will be added to the error message if your reranker fails to initialise.
@@ -21,7 +25,11 @@ class RerankerBase(ABC):
 
     def __init__(self, configs: Config, **kwargs: Any):
         self.configs = configs
+        assert self.configs.query is not None, (
+            "'configs' should contain the query messages."
+        )
         self.n_result = configs.n_result
+        self._raw_results: Optional[QueryResult] = None
 
     @classmethod
     def create(cls, configs: Config, **kwargs: Any):
@@ -38,5 +46,48 @@ def create(cls, configs: Config, **kwargs: Any):
             raise
 
     @abstractmethod
-    def rerank(self, results: QueryResult) -> list[str]:  # pragma: nocover
+    def compute_similarity(
+        self, results: list[str], query_message: str
+    ) -> Sequence[float]:  # pragma: nocover
+        """Given a list of n results and 1 query message,
+        return a list-like object of length n that contains the similarity scores between
+        each item in `results` and the `query_message`.
+
+        A high similarity score means the strings are semantically similar to each other.
+        Query messages are passed one per call, in the same order as in `self.configs.query`.
+
+        If you need the raw query results from chromadb,
+        they are saved in `self._raw_results` before this method is called.
+        """
         raise NotImplementedError
+
+    def rerank(self, results: QueryResult | dict) -> list[str]:
+        self._raw_results = cast(QueryResult, results)
+        query_chunks = self.configs.query
+        assert query_chunks
+        assert results["metadatas"] is not None
+        assert results["documents"] is not None
+        documents: DefaultDict[str, list[float]] = defaultdict(list)
+        for query_chunk_idx in range(len(query_chunks)):
+            chunk_ids = results["ids"][query_chunk_idx]
+            chunk_metas = results["metadatas"][query_chunk_idx]
+            chunk_docs = results["documents"][query_chunk_idx]
+            scores = self.compute_similarity(chunk_docs, query_chunks[query_chunk_idx])
+            for i, score in enumerate(scores):
+                if QueryInclude.chunk in self.configs.include:
+                    documents[chunk_ids[i]].append(float(score))
+                else:
+                    documents[str(chunk_metas[i]["path"])].append(float(score))
+
+        logger.debug("Document scores: %s", documents)
+        top_k = int(numpy.mean(tuple(len(i) for i in documents.values())))
+        for key in documents.keys():
+            documents[key] = heapq.nlargest(top_k, documents[key])
+
+        self._raw_results = None
+
+        return heapq.nlargest(
+            self.n_result,
+            documents.keys(),
+            key=lambda x: float(numpy.mean(documents[x])),
+        )
diff --git a/src/vectorcode/subcommands/query/reranker/cross_encoder.py b/src/vectorcode/subcommands/query/reranker/cross_encoder.py
@@ -1,11 +1,7 @@
-import heapq
 import logging
-from collections import defaultdict
-from typing import Any, DefaultDict
+from typing import Any
 
-import numpy
-
-from vectorcode.cli_utils import Config, QueryInclude
+from vectorcode.cli_utils import Config
 
 from .base import RerankerBase
 
@@ -25,9 +21,6 @@ def __init__(
         **kwargs: Any,
     ):
         super().__init__(configs)
-        assert self.configs.query is not None, (
-            "'configs' should contain the query messages."
-        )
         from sentence_transformers import CrossEncoder
 
         if configs.reranker_params.get("model_name_or_path") is None:
@@ -39,33 +32,8 @@ def __init__(
             )
         self.model = CrossEncoder(**configs.reranker_params)
 
-    def rerank(self, results) -> list[str]:
-        assert self.configs.query
-        query_chunks = self.configs.query
-        assert results["metadatas"] is not None
-        assert results["documents"] is not None
-        documents: DefaultDict[str, list[float]] = defaultdict(list)
-        for query_chunk_idx in range(len(query_chunks)):
-            chunk_ids = results["ids"][query_chunk_idx]
-            chunk_metas = results["metadatas"][query_chunk_idx]
-            chunk_docs = results["documents"][query_chunk_idx]
-            ranks = self.model.rank(
-                query_chunks[query_chunk_idx], chunk_docs, apply_softmax=True
-            )
-            for rank in ranks:
-                if QueryInclude.chunk in self.configs.include:
-                    documents[chunk_ids[rank["corpus_id"]]].append(float(rank["score"]))
-                else:
-                    documents[chunk_metas[rank["corpus_id"]]["path"]].append(
-                        float(rank["score"])
-                    )
-        logger.debug("Document scores: %s", documents)
-        top_k = int(numpy.mean(tuple(len(i) for i in documents.values())))
-        for key in documents.keys():
-            documents[key] = heapq.nlargest(top_k, documents[key])
-
-        return heapq.nlargest(
-            self.n_result,
-            documents.keys(),
-            key=lambda x: float(numpy.mean(documents[x])),
+    def compute_similarity(self, results: list[str], query_message: str):
+        return list(
+            float(i)
+            for i in self.model.predict([(chunk, query_message) for chunk in results])
         )
diff --git a/src/vectorcode/subcommands/query/reranker/naive.py b/src/vectorcode/subcommands/query/reranker/naive.py
@@ -1,11 +1,7 @@
-import heapq
 import logging
-from collections import defaultdict
-from typing import Any, DefaultDict
+from typing import Any, Sequence
 
-import numpy
-
-from vectorcode.cli_utils import Config, QueryInclude
+from vectorcode.cli_utils import Config
 
 from .base import RerankerBase
 
@@ -21,30 +17,11 @@ class NaiveReranker(RerankerBase):
     def __init__(self, configs: Config, **kwargs: Any):
         super().__init__(configs)
 
-    def rerank(self, results) -> list[str]:
-        assert results["metadatas"] is not None
-        assert results["distances"] is not None
-        documents: DefaultDict[str, list[float]] = defaultdict(list)
-        for query_chunk_idx in range(len(results["ids"])):
-            chunk_ids = results["ids"][query_chunk_idx]
-            chunk_metas = results["metadatas"][query_chunk_idx]
-            chunk_distances = results["distances"][query_chunk_idx]
-            # NOTE: distances, smaller is better.
-            paths = [str(meta["path"]) for meta in chunk_metas]
-            assert len(paths) == len(chunk_distances)
-            for distance, identifier in zip(
-                chunk_distances,
-                chunk_ids if QueryInclude.chunk in self.configs.include else paths,
-            ):
-                if identifier is None:  # pragma: nocover
-                    # so that vectorcode doesn't break on old collections.
-                    continue
-                documents[identifier].append(distance)
-        logger.debug("Document scores: %s", documents)
-        top_k = int(numpy.mean(tuple(len(i) for i in documents.values())))
-        for key in documents.keys():
-            documents[key] = heapq.nsmallest(top_k, documents[key])
-
-        return heapq.nsmallest(
-            self.n_result, documents.keys(), lambda x: float(numpy.mean(documents[x]))
-        )
+    def compute_similarity(
+        self, results: list[str], query_message: str
+    ) -> Sequence[float]:
+        assert self._raw_results is not None, "Expecting raw results from the database."
+        assert self._raw_results.get("distances") is not None
+        assert self.configs.query, "Expecting query messages in self.configs"
+        idx = self.configs.query.index(query_message)
+        return list(-i for i in self._raw_results.get("distances")[idx])
diff --git a/tests/subcommands/query/test_reranker.py b/tests/subcommands/query/test_reranker.py
@@ -1,5 +1,6 @@
 from unittest.mock import MagicMock, patch
 
+import numpy
 import pytest
 
 from vectorcode.cli_utils import Config, QueryInclude
@@ -94,24 +95,6 @@ def test_naive_reranker_rerank(naive_reranker_conf, query_result):
         assert isinstance(path, str)
 
 
-def test_naive_reranker_handles_none_path(config, query_result):
-    """Test NaiveReranker properly handles None paths in metadata"""
-    # Create a copy with a None path
-    query_result_with_none = query_result.copy()
-    query_result_with_none["metadatas"] = [
-        [{"path": "file1.py"}, {"path": None}, {"path": "file3.py"}],
-        [{"path": "file2.py"}, {"path": "file4.py"}, {"path": "file3.py"}],
-    ]
-
-    reranker = NaiveReranker(config)
-    result = reranker.rerank(query_result_with_none)
-
-    # Check the None path was handled without errors
-    assert isinstance(result, list)
-    # None should be filtered out
-    assert None not in result
-
-
 @patch("sentence_transformers.CrossEncoder")
 def test_cross_encoder_reranker_initialization(mock_cross_encoder: MagicMock, config):
     reranker = CrossEncoderReranker(config)
@@ -141,43 +124,34 @@ def test_cross_encoder_reranker_rerank(
     mock_model = MagicMock()
     mock_cross_encoder.return_value = mock_model
 
-    # Configure mock rank method to return predetermined ranks
-    mock_model.rank.return_value = [
-        {"corpus_id": 0, "score": 0.9},
-        {"corpus_id": 1, "score": 0.7},
-        {"corpus_id": 2, "score": 0.8},
-    ]
+    # Configure mock predict to return numpy array with float32 dtype
+    scores = numpy.array([0.9, 0.7, 0.8], dtype=numpy.float32)
+    mock_model.predict.return_value = scores
 
-    reranker = CrossEncoderReranker(config)
+    # Ensure complete query_result structure
+    query_result.update(
+        {
+            "ids": [["id1", "id2", "id3"], ["id4", "id5", "id6"]],
+            "documents": [["doc1", "doc2", "doc3"], ["doc4", "doc5", "doc6"]],
+            "metadatas": [
+                [{"path": "p1"}, {"path": "p2"}, {"path": "p3"}],
+                [{"path": "p4"}, {"path": "p5"}, {"path": "p6"}],
+            ],
+        }
+    )
 
+    reranker = CrossEncoderReranker(config)
     result = reranker.rerank(query_result)
 
-    # Verify the model was called with correct parameters
-    mock_model.rank.assert_called()
-
-    # Check result
+    # Result assertions
     assert isinstance(result, list)
+    assert all(isinstance(path, str) for path in result)
     assert len(result) <= config.n_result
 
-    # Check all returned items are strings (paths)
-    for path in result:
-        assert isinstance(path, str)
-
 
-def test_naive_reranker_document_selection_logic(naive_reranker_conf):
+def test_naive_reranker_document_selection_logic(naive_reranker_conf, query_result):
     """Test that NaiveReranker correctly selects documents based on distances"""
     # Create a query result with known distances
-    query_result = {
-        "ids": [["id1", "id2", "id3"], ["id4", "id5", "id6"]],
-        "distances": [
-            [0.3, 0.1, 0.2],  # file2 has lowest, then file3, then file1
-            [0.6, 0.4, 0.5],  # file4 has lowest, then file3, then file2
-        ],
-        "metadatas": [
-            [{"path": "file1.py"}, {"path": "file2.py"}, {"path": "file3.py"}],
-            [{"path": "file2.py"}, {"path": "file4.py"}, {"path": "file3.py"}],
-        ],
-    }
 
     reranker = NaiveReranker(naive_reranker_conf)
     result = reranker.rerank(query_result)
@@ -188,19 +162,12 @@ def test_naive_reranker_document_selection_logic(naive_reranker_conf):
     assert "file2.py" in result or "file3.py" in result
 
 
-def test_naive_reranker_with_chunk_ids(naive_reranker_conf):
+def test_naive_reranker_with_chunk_ids(naive_reranker_conf, query_result):
     """Test NaiveReranker returns chunk IDs when QueryInclude.chunk is set"""
     naive_reranker_conf.include.append(
         QueryInclude.chunk
     )  # Assuming QueryInclude.chunk would be "chunk"
-    query_result = {
-        "ids": [["id1", "id2"], ["id3", "id1"]],
-        "distances": [[0.1, 0.2], [0.3, 0.4]],
-        "metadatas": [
-            [{"path": "file1.py"}, {"path": "file2.py"}],
-            [{"path": "file3.py"}, {"path": "file1.py"}],
-        ],
-    }
+
     reranker = NaiveReranker(naive_reranker_conf)
     result = reranker.rerank(query_result)
 
@@ -212,33 +179,22 @@ def test_naive_reranker_with_chunk_ids(naive_reranker_conf):
 
 @patch("sentence_transformers.CrossEncoder")
 def test_cross_encoder_reranker_with_chunk_ids(
-    mock_cross_encoder, config, query_chunks
+    mock_cross_encoder, config, query_result
 ):
     """Test CrossEncoderReranker returns chunk IDs when QueryInclude.chunk is set"""
     mock_model = MagicMock()
     mock_cross_encoder.return_value = mock_model
-    mock_model.rank.return_value = [
-        {"corpus_id": 0, "score": 0.9},
-        {"corpus_id": 1, "score": 0.7},
-    ]
-
-    config.include = {"chunk"}  # Use comma instead of append
-    reranker = CrossEncoderReranker(
-        config,
-    )
 
-    # Match query_chunks length with results
-    result = reranker.rerank(
-        {
-            "ids": [["id1", "id2"], ["id3", "id4"]],  # Two query chunks
-            "metadatas": [
-                [{"path": "file1.py"}, {"path": "file2.py"}],
-                [{"path": "file3.py"}, {"path": "file4.py"}],
-            ],
-            "documents": [["doc1", "doc2"], ["doc3", "doc4"]],
-        },
-    )
+    # Setup mock to return numpy array scores
+    scores = numpy.array([0.9, 0.7], dtype=numpy.float32)
+    mock_model.predict.return_value = scores
+
+    config.include = {QueryInclude.chunk}
+    reranker = CrossEncoderReranker(config)
 
+    result = reranker.rerank(query_result)
+
+    mock_model.predict.assert_called()
     assert isinstance(result, list)
     assert all(isinstance(id, str) for id in result)
     assert all(id in ["id1", "id2", "id3", "id4"] for id in result)
@@ -275,12 +231,15 @@ def test_add_reranker_success():
 
     @add_reranker
     class TestReranker(RerankerBase):
-        def rerank(self, results, query_chunks):
+        def compute_similarity(self, results, query_message):
             return []
 
     assert len(get_available_rerankers()) == original_count + 1
     assert "TestReranker" in __supported_rerankers
-    assert isinstance(get_reranker(Config(reranker="TestReranker")), TestReranker)
+    assert isinstance(
+        get_reranker(Config(reranker="TestReranker", query=["hello world"])),
+        TestReranker,
+    )
     __supported_rerankers.pop("TestReranker")