Simplify the process of building a new reranker by adding an abstract `compute_similarity` method to `RerankerBase`

Davidyz · Davidyz · commit d61a68a810eb · 2025-04-21T11:33:54.000+08:00
diff --git a/src/vectorcode/subcommands/query/reranker/base.py b/src/vectorcode/subcommands/query/reranker/base.py
@@ -1,10 +1,13 @@
+import heapq
 import logging
 from abc import ABC, abstractmethod
-from typing import Any
+from collections import defaultdict
+from typing import Any, DefaultDict, Optional, Sequence, cast
 
+import numpy
 from chromadb.api.types import QueryResult
 
-from vectorcode.cli_utils import Config
+from vectorcode.cli_utils import Config, QueryInclude
 
 logger = logging.getLogger(name=__name__)
 
@@ -21,7 +24,11 @@ class RerankerBase(ABC):
 
     def __init__(self, configs: Config, **kwargs: Any):
         self.configs = configs
+        assert self.configs.query is not None, (
+            "'configs' should contain the query messages."
+        )
         self.n_result = configs.n_result
+        self._raw_results: Optional[QueryResult] = None  # set on each rerank() call
 
     @classmethod
     def create(cls, configs: Config, **kwargs: Any):
@@ -38,5 +45,43 @@ def create(cls, configs: Config, **kwargs: Any):
             raise
 
     @abstractmethod
-    def rerank(self, results: QueryResult) -> list[str]:  # pragma: nocover
+    def compute_similarity(
+        self, results: list[str], query_message: str
+    ) -> Sequence:  # pragma: nocover
+        """Score each of the n `results` against a single `query_message`.
+
+        Return a list-like object of length n whose i-th item is the similarity score
+        between `results[i]` and `query_message`. A higher score means the two strings
+        are more semantically similar. `rerank` calls this method once per query
+        message, in the order the messages appear in `self.configs.query`.
+        """
         raise NotImplementedError
+
+    def rerank(self, results: QueryResult | dict) -> list[str]:
+        self._raw_results = cast(QueryResult, results)  # kept so subclasses can read it
+        query_chunks = self.configs.query
+        assert query_chunks
+        assert results["metadatas"] is not None
+        assert results["documents"] is not None
+        documents: DefaultDict[str, list[float]] = defaultdict(list)  # key -> scores
+        for query_chunk_idx in range(len(query_chunks)):
+            chunk_ids = results["ids"][query_chunk_idx]
+            chunk_metas = results["metadatas"][query_chunk_idx]
+            chunk_docs = results["documents"][query_chunk_idx]
+            scores = self.compute_similarity(chunk_docs, query_chunks[query_chunk_idx])
+            for i, score in enumerate(scores):  # scores[i] belongs to chunk_docs[i]
+                if QueryInclude.chunk in self.configs.include:
+                    documents[chunk_ids[i]].append(float(score))
+                else:
+                    documents[chunk_metas[i]["path"]].append(float(score))
+
+        logger.debug("Document scores: %s", documents)
+        top_k = int(numpy.mean([len(i) for i in documents.values()] or [0]))  # empty-safe
+        for key in documents.keys():
+            documents[key] = heapq.nlargest(top_k, documents[key])  # best top_k per key
+
+        return heapq.nlargest(
+            self.n_result,
+            documents.keys(),
+            key=lambda x: float(numpy.mean(documents[x])),
+        )
diff --git a/src/vectorcode/subcommands/query/reranker/cross_encoder.py b/src/vectorcode/subcommands/query/reranker/cross_encoder.py
@@ -1,11 +1,7 @@
-import heapq
 import logging
-from collections import defaultdict
-from typing import Any, DefaultDict
+from typing import Any
 
-import numpy
-
-from vectorcode.cli_utils import Config, QueryInclude
+from vectorcode.cli_utils import Config
 
 from .base import RerankerBase
 
@@ -25,9 +21,6 @@ def __init__(
         **kwargs: Any,
     ):
         super().__init__(configs)
-        assert self.configs.query is not None, (
-            "'configs' should contain the query messages."
-        )
         from sentence_transformers import CrossEncoder
 
         if configs.reranker_params.get("model_name_or_path") is None:
@@ -39,33 +32,8 @@ def __init__(
             )
         self.model = CrossEncoder(**configs.reranker_params)
 
-    def rerank(self, results) -> list[str]:
-        assert self.configs.query
-        query_chunks = self.configs.query
-        assert results["metadatas"] is not None
-        assert results["documents"] is not None
-        documents: DefaultDict[str, list[float]] = defaultdict(list)
-        for query_chunk_idx in range(len(query_chunks)):
-            chunk_ids = results["ids"][query_chunk_idx]
-            chunk_metas = results["metadatas"][query_chunk_idx]
-            chunk_docs = results["documents"][query_chunk_idx]
-            ranks = self.model.rank(
-                query_chunks[query_chunk_idx], chunk_docs, apply_softmax=True
-            )
-            for rank in ranks:
-                if QueryInclude.chunk in self.configs.include:
-                    documents[chunk_ids[rank["corpus_id"]]].append(float(rank["score"]))
-                else:
-                    documents[chunk_metas[rank["corpus_id"]]["path"]].append(
-                        float(rank["score"])
-                    )
-        logger.debug("Document scores: %s", documents)
-        top_k = int(numpy.mean(tuple(len(i) for i in documents.values())))
-        for key in documents.keys():
-            documents[key] = heapq.nlargest(top_k, documents[key])
-
-        return heapq.nlargest(
-            self.n_result,
-            documents.keys(),
-            key=lambda x: float(numpy.mean(documents[x])),
+    def compute_similarity(self, results: list[str], query_message: str):
+        return list(  # pairs are (query, passage), the order CrossEncoder.rank builds
+            float(i)  # coerce model outputs (e.g. numpy scalars) to plain floats
+            for i in self.model.predict([(query_message, chunk) for chunk in results])
         )
diff --git a/src/vectorcode/subcommands/query/reranker/naive.py b/src/vectorcode/subcommands/query/reranker/naive.py
@@ -1,11 +1,7 @@
-import heapq
 import logging
-from collections import defaultdict
-from typing import Any, DefaultDict
+from typing import Any, Sequence
 
-import numpy
-
-from vectorcode.cli_utils import Config, QueryInclude
+from vectorcode.cli_utils import Config
 
 from .base import RerankerBase
 
@@ -21,30 +17,11 @@ class NaiveReranker(RerankerBase):
     def __init__(self, configs: Config, **kwargs: Any):
         super().__init__(configs)
 
-    def rerank(self, results) -> list[str]:
-        assert results["metadatas"] is not None
-        assert results["distances"] is not None
-        documents: DefaultDict[str, list[float]] = defaultdict(list)
-        for query_chunk_idx in range(len(results["ids"])):
-            chunk_ids = results["ids"][query_chunk_idx]
-            chunk_metas = results["metadatas"][query_chunk_idx]
-            chunk_distances = results["distances"][query_chunk_idx]
-            # NOTE: distances, smaller is better.
-            paths = [str(meta["path"]) for meta in chunk_metas]
-            assert len(paths) == len(chunk_distances)
-            for distance, identifier in zip(
-                chunk_distances,
-                chunk_ids if QueryInclude.chunk in self.configs.include else paths,
-            ):
-                if identifier is None:  # pragma: nocover
-                    # so that vectorcode doesn't break on old collections.
-                    continue
-                documents[identifier].append(distance)
-        logger.debug("Document scores: %s", documents)
-        top_k = int(numpy.mean(tuple(len(i) for i in documents.values())))
-        for key in documents.keys():
-            documents[key] = heapq.nsmallest(top_k, documents[key])
-
-        return heapq.nsmallest(
-            self.n_result, documents.keys(), lambda x: float(numpy.mean(documents[x]))
-        )
+    def compute_similarity(
+        self, results: list[str], query_message: str
+    ) -> Sequence[float]:
+        assert self._raw_results is not None, "Expecting raw results from the database."
+        assert self._raw_results.get("distances") is not None
+        assert self.configs.query, "Expecting query messages in self.configs"
+        idx = self.configs.query.index(query_message)  # first match picks the distances row
+        return list(-i for i in self._raw_results.get("distances")[idx])  # smaller distance => higher score
diff --git a/tests/subcommands/query/test_reranker.py b/tests/subcommands/query/test_reranker.py
@@ -1,5 +1,6 @@
 from unittest.mock import MagicMock, patch
 
+import numpy
 import pytest
 
 from vectorcode.cli_utils import Config, QueryInclude
@@ -94,24 +95,6 @@ def test_naive_reranker_rerank(naive_reranker_conf, query_result):
         assert isinstance(path, str)
 
 
-def test_naive_reranker_handles_none_path(config, query_result):
-    """Test NaiveReranker properly handles None paths in metadata"""
-    # Create a copy with a None path
-    query_result_with_none = query_result.copy()
-    query_result_with_none["metadatas"] = [
-        [{"path": "file1.py"}, {"path": None}, {"path": "file3.py"}],
-        [{"path": "file2.py"}, {"path": "file4.py"}, {"path": "file3.py"}],
-    ]
-
-    reranker = NaiveReranker(config)
-    result = reranker.rerank(query_result_with_none)
-
-    # Check the None path was handled without errors
-    assert isinstance(result, list)
-    # None should be filtered out
-    assert None not in result
-
-
 @patch("sentence_transformers.CrossEncoder")
 def test_cross_encoder_reranker_initialization(mock_cross_encoder: MagicMock, config):
     reranker = CrossEncoderReranker(config)
@@ -141,43 +124,34 @@ def test_cross_encoder_reranker_rerank(
     mock_model = MagicMock()
     mock_cross_encoder.return_value = mock_model
 
-    # Configure mock rank method to return predetermined ranks
-    mock_model.rank.return_value = [
-        {"corpus_id": 0, "score": 0.9},
-        {"corpus_id": 1, "score": 0.7},
-        {"corpus_id": 2, "score": 0.8},
-    ]
+    # Configure mock predict to return numpy array with float32 dtype
+    scores = numpy.array([0.9, 0.7, 0.8], dtype=numpy.float32)
+    mock_model.predict.return_value = scores
 
-    reranker = CrossEncoderReranker(config)
+    # Ensure complete query_result structure
+    query_result.update(
+        {
+            "ids": [["id1", "id2", "id3"], ["id4", "id5", "id6"]],
+            "documents": [["doc1", "doc2", "doc3"], ["doc4", "doc5", "doc6"]],
+            "metadatas": [
+                [{"path": "p1"}, {"path": "p2"}, {"path": "p3"}],
+                [{"path": "p4"}, {"path": "p5"}, {"path": "p6"}],
+            ],
+        }
+    )
 
+    reranker = CrossEncoderReranker(config)
     result = reranker.rerank(query_result)
 
-    # Verify the model was called with correct parameters
-    mock_model.rank.assert_called()
-
-    # Check result
+    # Result assertions
     assert isinstance(result, list)
+    assert all(isinstance(path, str) for path in result)
     assert len(result) <= config.n_result
 
-    # Check all returned items are strings (paths)
-    for path in result:
-        assert isinstance(path, str)
-
 
-def test_naive_reranker_document_selection_logic(naive_reranker_conf):
+def test_naive_reranker_document_selection_logic(naive_reranker_conf, query_result):
     """Test that NaiveReranker correctly selects documents based on distances"""
     # Create a query result with known distances
-    query_result = {
-        "ids": [["id1", "id2", "id3"], ["id4", "id5", "id6"]],
-        "distances": [
-            [0.3, 0.1, 0.2],  # file2 has lowest, then file3, then file1
-            [0.6, 0.4, 0.5],  # file4 has lowest, then file3, then file2
-        ],
-        "metadatas": [
-            [{"path": "file1.py"}, {"path": "file2.py"}, {"path": "file3.py"}],
-            [{"path": "file2.py"}, {"path": "file4.py"}, {"path": "file3.py"}],
-        ],
-    }
 
     reranker = NaiveReranker(naive_reranker_conf)
     result = reranker.rerank(query_result)
@@ -188,19 +162,12 @@ def test_naive_reranker_document_selection_logic(naive_reranker_conf):
     assert "file2.py" in result or "file3.py" in result
 
 
-def test_naive_reranker_with_chunk_ids(naive_reranker_conf):
+def test_naive_reranker_with_chunk_ids(naive_reranker_conf, query_result):
     """Test NaiveReranker returns chunk IDs when QueryInclude.chunk is set"""
     naive_reranker_conf.include.append(
         QueryInclude.chunk
     )  # Assuming QueryInclude.chunk would be "chunk"
-    query_result = {
-        "ids": [["id1", "id2"], ["id3", "id1"]],
-        "distances": [[0.1, 0.2], [0.3, 0.4]],
-        "metadatas": [
-            [{"path": "file1.py"}, {"path": "file2.py"}],
-            [{"path": "file3.py"}, {"path": "file1.py"}],
-        ],
-    }
+
     reranker = NaiveReranker(naive_reranker_conf)
     result = reranker.rerank(query_result)
 
@@ -212,33 +179,22 @@ def test_naive_reranker_with_chunk_ids(naive_reranker_conf):
 
 @patch("sentence_transformers.CrossEncoder")
 def test_cross_encoder_reranker_with_chunk_ids(
-    mock_cross_encoder, config, query_chunks
+    mock_cross_encoder, config, query_result
 ):
     """Test CrossEncoderReranker returns chunk IDs when QueryInclude.chunk is set"""
     mock_model = MagicMock()
     mock_cross_encoder.return_value = mock_model
-    mock_model.rank.return_value = [
-        {"corpus_id": 0, "score": 0.9},
-        {"corpus_id": 1, "score": 0.7},
-    ]
-
-    config.include = {"chunk"}  # Use comma instead of append
-    reranker = CrossEncoderReranker(
-        config,
-    )
 
-    # Match query_chunks length with results
-    result = reranker.rerank(
-        {
-            "ids": [["id1", "id2"], ["id3", "id4"]],  # Two query chunks
-            "metadatas": [
-                [{"path": "file1.py"}, {"path": "file2.py"}],
-                [{"path": "file3.py"}, {"path": "file4.py"}],
-            ],
-            "documents": [["doc1", "doc2"], ["doc3", "doc4"]],
-        },
-    )
+    # Setup mock to return numpy array scores
+    scores = numpy.array([0.9, 0.7], dtype=numpy.float32)
+    mock_model.predict.return_value = scores
+
+    config.include = {QueryInclude.chunk}
+    reranker = CrossEncoderReranker(config)
 
+    result = reranker.rerank(query_result)
+
+    mock_model.predict.assert_called()
     assert isinstance(result, list)
     assert all(isinstance(id, str) for id in result)
     assert all(id in ["id1", "id2", "id3", "id4"] for id in result)
@@ -275,12 +231,15 @@ def test_add_reranker_success():
 
     @add_reranker
     class TestReranker(RerankerBase):
-        def rerank(self, results, query_chunks):
+        def compute_similarity(self, results, query_message):
             return []
 
     assert len(get_available_rerankers()) == original_count + 1
     assert "TestReranker" in __supported_rerankers
-    assert isinstance(get_reranker(Config(reranker="TestReranker")), TestReranker)
+    assert isinstance(
+        get_reranker(Config(reranker="TestReranker", query=["hello world"])),
+        TestReranker,
+    )
     __supported_rerankers.pop("TestReranker")