Davidyz
diff --git a/‎docs/cli.md‎
Lines changed: 18 additions & 7 deletions b/‎docs/cli.md‎
Lines changed: 18 additions & 7 deletions
diff --git a/‎src/vectorcode/cli_utils.py‎
Lines changed: 25 additions & 14 deletions b/‎src/vectorcode/cli_utils.py‎
Lines changed: 25 additions & 14 deletions
diff --git a/‎src/vectorcode/subcommands/query/__init__.py‎
Lines changed: 3 additions & 11 deletions b/‎src/vectorcode/subcommands/query/__init__.py‎
Lines changed: 3 additions & 11 deletions
diff --git a/‎src/vectorcode/subcommands/query/reranker.py‎
Lines changed: 0 additions & 95 deletions b/‎src/vectorcode/subcommands/query/reranker.py‎
Lines changed: 0 additions & 95 deletions
diff --git a/‎src/vectorcode/subcommands/query/reranker/__init__.py‎
Lines changed: 76 additions & 0 deletions b/‎src/vectorcode/subcommands/query/reranker/__init__.py‎
Lines changed: 76 additions & 0 deletions
diff --git a/‎src/vectorcode/subcommands/query/reranker/base.py‎
Lines changed: 95 additions & 0 deletions b/‎src/vectorcode/subcommands/query/reranker/base.py‎
Lines changed: 95 additions & 0 deletions
@@ -248,14 +248,25 @@ The JSON configuration file may hold the following values:
   guarantees the return of `n` documents, but with the risk of including too
   many less-relevant chunks that may affect the document selection. Default: 
   `-1` (any negative value means selecting documents based on all indexed chunks);
-- `reranker`: string, a reranking model supported by 
-  [`CrossEncoder`](https://sbert.net/docs/package_reference/cross_encoder/index.html). 
-  A list of available models is available on their documentation. The default
-  model is `"cross-encoder/ms-marco-MiniLM-L-6-v2"`. You can disable the use of
-  `CrossEncoder` by setting this option to a falsy value that is not `null`,
-  such as `false` or `""` (empty string);
+- `reranker`: string, the reranking method to use. Currently supports
+  `CrossEncoderReranker` (default, using the
+  [sentence-transformers cross-encoder](https://sbert.net/docs/package_reference/cross_encoder/cross_encoder.html))
+  and `NaiveReranker` (sorts chunks by the "distance" between the embedding
+  vectors);
 - `reranker_params`: dictionary, similar to `embedding_params`. The options
-  passed to `CrossEncoder` class constructor;
+  passed to the reranker class constructor. For `CrossEncoderReranker`, these
+  are the options passed to the 
+  [`CrossEncoder`](https://sbert.net/docs/package_reference/cross_encoder/cross_encoder.html#id1)
+  class. For example, if you want to use a non-default model, you can use the
+  following:
+  ```json
+  {
+    "reranker_params": {
+      "model_name_or_path": "your_model_here"
+    }
+  }
+  ```
+  ;
 - `db_settings`: dictionary, works in a similar way to `embedding_params`, but 
   for Chromadb client settings so that you can configure 
   [authentication for remote Chromadb](https://docs.trychroma.com/production/administration/auth);
 
@@ -85,8 +85,8 @@ class Config:
     overlap_ratio: float = 0.2
     query_multiplier: int = -1
     query_exclude: list[PathLike] = field(default_factory=list)
-    reranker: Optional[str] = "cross-encoder/ms-marco-MiniLM-L-6-v2"
-    reranker_params: dict[str, Any] = field(default_factory=dict)
+    reranker: Optional[str] = "CrossEncoderReranker"
+    reranker_params: dict[str, Any] = field(default_factory=lambda: {})
     check_item: Optional[str] = None
     use_absolute_path: bool = False
     include: list[QueryInclude] = field(
@@ -100,6 +100,7 @@ async def import_from(cls, config_dict: dict[str, Any]) -> "Config":
         """
         Raise IOError if db_path is not valid.
         """
+        default_config = Config()
         db_path = config_dict.get("db_path")
         host = config_dict.get("host") or "localhost"
         port = config_dict.get("port") or 8000
@@ -112,25 +113,35 @@ async def import_from(cls, config_dict: dict[str, Any]) -> "Config":
         return Config(
             **{
                 "embedding_function": config_dict.get(
-                    "embedding_function", "SentenceTransformerEmbeddingFunction"
+                    "embedding_function", default_config.embedding_function
+                ),
+                "embedding_params": config_dict.get(
+                    "embedding_params", default_config.embedding_params
                 ),
-                "embedding_params": config_dict.get("embedding_params", {}),
                 "host": host,
                 "port": port,
                 "db_path": db_path,
                 "db_log_path": os.path.expanduser(
-                    config_dict.get("db_log_path", "~/.local/share/vectorcode/")
+                    config_dict.get("db_log_path", default_config.db_log_path)
+                ),
+                "chunk_size": config_dict.get("chunk_size", default_config.chunk_size),
+                "overlap_ratio": config_dict.get(
+                    "overlap_ratio", default_config.overlap_ratio
+                ),
+                "query_multiplier": config_dict.get(
+                    "query_multiplier", default_config.query_multiplier
+                ),
+                "reranker": config_dict.get("reranker", default_config.reranker),
+                "reranker_params": config_dict.get(
+                    "reranker_params", default_config.reranker_params
+                ),
+                "db_settings": config_dict.get(
+                    "db_settings", default_config.db_settings
                 ),
-                "chunk_size": config_dict.get("chunk_size", 2500),
-                "overlap_ratio": config_dict.get("overlap_ratio", 0.2),
-                "query_multiplier": config_dict.get("query_multiplier", -1),
-                "reranker": config_dict.get(
-                    "reranker", "cross-encoder/ms-marco-MiniLM-L-6-v2"
+                "hnsw": config_dict.get("hnsw", default_config.hnsw),
+                "chunk_filters": config_dict.get(
+                    "chunk_filters", default_config.chunk_filters
                 ),
-                "reranker_params": config_dict.get("reranker_params", {}),
-                "db_settings": config_dict.get("db_settings", None),
-                "hnsw": config_dict.get("hnsw", {}),
-                "chunk_filters": config_dict.get("chunk_filters", {}),
             }
         )
 
 
@@ -14,6 +14,7 @@
     get_collection,
     verify_ef,
 )
+from vectorcode.subcommands.query.reranker import get_reranker
 
 logger = logging.getLogger(name=__name__)
 
@@ -66,17 +67,8 @@ async def get_query_result_files(
         # no results found
         return []
 
-    if not configs.reranker:
-        from .reranker import NaiveReranker
-
-        aggregated_results = NaiveReranker(configs).rerank(results)
-    else:
-        from .reranker import CrossEncoderReranker
-
-        aggregated_results = CrossEncoderReranker(
-            configs, query_chunks, configs.reranker, **configs.reranker_params
-        ).rerank(results)
-    return aggregated_results
+    reranker = get_reranker(configs)
+    return await reranker.rerank(results)
 
 
 async def build_query_results(
 
@@ -0,0 +1,76 @@
+import logging
+import sys
+from typing import Type
+
+from vectorcode.cli_utils import Config
+
+from .base import RerankerBase
+from .cross_encoder import CrossEncoderReranker
+from .naive import NaiveReranker
+
+# Names re-exported as the public API of this package.
+__all__ = ["RerankerBase", "NaiveReranker", "CrossEncoderReranker"]
+
+logger = logging.getLogger(name=__name__)
+
+# Registry mapping a reranker class name to the class itself. It starts with
+# the built-in rerankers; `add_reranker` inserts custom classes into it and
+# `get_reranker` / `get_available_rerankers` read from it.
+__supported_rerankers: dict[str, Type[RerankerBase]] = {
+    "CrossEncoderReranker": CrossEncoderReranker,
+    "NaiveReranker": NaiveReranker,
+}
+
+
+def add_reranker(cls):
+    """
+    This is a class decorator that allows you to add a custom reranker that can be
+    recognised by the `get_reranker` function.
+
+    Your reranker should inherit `RerankerBase` and be decorated by `add_reranker`:
+    ```python
+    @add_reranker
+    class CustomReranker(RerankerBase):
+        # override the methods according to your need.
+    ```
+
+    The class is stored in the registry under its `__name__` and returned
+    unchanged, so the decorated name still refers to the original class.
+
+    Raises:
+        TypeError: if `cls` is not a class derived from `RerankerBase`.
+        AttributeError: if a reranker with the same class name has already
+            been registered.
+    """
+    # `issubclass` raises its own, uninformative TypeError when given a
+    # non-class object, so rule that out first to always log and report the
+    # descriptive message below.
+    if not (isinstance(cls, type) and issubclass(cls, RerankerBase)):
+        error_message = f'{cls} should be a subclass of "RerankerBase"'
+        logger.error(error_message)
+        raise TypeError(error_message)
+
+    if __supported_rerankers.get(cls.__name__):
+        error_message = f"{cls.__name__} has been registered."
+        logger.error(error_message)
+        raise AttributeError(error_message)
+
+    __supported_rerankers[cls.__name__] = cls
+    return cls
+
+
+def get_available_rerankers():
+    """Return a new list holding every reranker class currently in the registry."""
+    return [*__supported_rerankers.values()]
+
+
+def get_reranker(configs: Config) -> RerankerBase:
+    """
+    Build the reranker selected by `configs.reranker`.
+
+    When `configs.reranker` names a `RerankerBase` subclass that is either an
+    attribute of this module or present in the registry, that class is
+    instantiated through its `create` classmethod.
+
+    Any other value goes through the legacy path, which logs a warning and then:
+    - returns a `NaiveReranker` if `configs.reranker` is falsy;
+    - otherwise treats the value as a cross-encoder model name: it is written to
+      `configs.reranker_params["model_name_or_path"]`, `configs.reranker` is
+      rewritten to `"CrossEncoderReranker"` (both mutate `configs` in place),
+      and a `CrossEncoderReranker` is returned.
+    """
+    if configs.reranker:
+        # Look at the module attribute first (built-in rerankers), then at the
+        # registry (custom rerankers). Only real `RerankerBase` subclasses are
+        # dispatched: a bare `hasattr` check would also match unrelated module
+        # attributes such as "sys" or "logger" and crash on `.create`.
+        for candidate in (
+            getattr(sys.modules[__name__], configs.reranker, None),
+            __supported_rerankers.get(configs.reranker),
+        ):
+            if isinstance(candidate, type) and issubclass(candidate, RerankerBase):
+                return candidate.create(configs)
+
+    # TODO: replace the following with an Exception before the release of 0.6.0.
+    logger.warning(
+        f""""reranker" option should be set to one of the following: {[i.__name__ for i in get_available_rerankers()]}.
+To choose a CrossEncoderReranker model, you can set the "model_name_or_path" key in the "reranker_params" option to the name/path of the model.
+To use NaiveReranker, set the "reranker" option to "NaiveReranker".
+The old configuration syntax will be DEPRECATED in v0.6.0
+                """
+    )
+    if not configs.reranker:
+        return NaiveReranker(configs)
+    else:
+        # Legacy syntax: the "reranker" option held the cross-encoder model name.
+        configs.reranker_params.update({"model_name_or_path": configs.reranker})
+        configs.reranker = "CrossEncoderReranker"
+        return CrossEncoderReranker(
+            configs,
+        )
@@ -0,0 +1,95 @@
+import heapq
+import logging
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from typing import Any, DefaultDict, Optional, Sequence, cast
+
+import numpy
+from chromadb.api.types import QueryResult
+
+from vectorcode.cli_utils import Config, QueryInclude
+
+# Logger named after this module; `RerankerBase.rerank` uses it for debug output.
+logger = logging.getLogger(name=__name__)
+
+
+class RerankerBase(ABC):
+    """This is the base class for the rerankers.
+    You should use the configs.reranker_params field to store and pass the parameters used for your reranker.
+    You should implement the `compute_similarity` method, which will be called by `rerank` to compute
+    similarity scores between search query and results.
+    The items in the returned list should be sorted such that the relevance decreases along the list.
+
+    The class doc string will be added to the error message if your reranker fails to initialise.
+    Thus, this is a good place to put the instructions to configuring your reranker.
+    """
+
+    def __init__(self, configs: Config, **kwargs: Any):
+        """Store `configs` and the requested result count.
+
+        `configs.query` must not be `None`. Extra keyword arguments are
+        accepted but not used by the base class.
+        """
+        self.configs = configs
+        assert self.configs.query is not None, (
+            "'configs' should contain the query messages."
+        )
+        self.n_result = configs.n_result
+        # Populated only for the duration of a `rerank` call.
+        self._raw_results: Optional[QueryResult] = None
+
+    @classmethod
+    def create(cls, configs: Config, **kwargs: Any):
+        """Instantiate `cls`, annotating any initialisation failure.
+
+        If the constructor raises, a note containing the class docstring (or a
+        generic hint when the class has none) is attached to the exception
+        before it is re-raised unchanged.
+        """
+        try:
+            return cls(configs, **kwargs)
+        except Exception as e:
+            e.add_note(
+                "\n"
+                + (
+                    cls.__doc__
+                    or f"There was an issue initialising {cls}. Please doublecheck your configuration."
+                )
+            )
+            raise
+
+    @abstractmethod
+    async def compute_similarity(
+        self, results: list[str], query_message: str
+    ) -> Sequence[float]:  # pragma: nocover
+        """Given a list of n results and 1 query message,
+        return a list-like object of length n that contains the similarity scores between
+        each item in `results` and the `query_message`.
+
+        A high similarity score means the strings are semantically similar to each other.
+        `query_message` will be loaded in the same order as they appear in `self.configs.query`.
+
+        If you need the raw query results from chromadb,
+        it'll be saved in `self._raw_results` before this method is called.
+        """
+        raise NotImplementedError
+
+    async def rerank(self, results: QueryResult | dict) -> list[str]:
+        """Score the query results and return the best keys, most relevant first.
+
+        For each query message in `self.configs.query`, `compute_similarity` is
+        awaited on the matching documents. Scores are grouped by chunk id when
+        `QueryInclude.chunk` is in `self.configs.include`, and by the `"path"`
+        metadata otherwise. Each group keeps its `top_k` largest scores, where
+        `top_k` is the truncated mean group size, and at most `self.n_result`
+        keys are returned, ordered by decreasing mean score.
+
+        Returns an empty list when no score was collected.
+        """
+        self._raw_results = cast(QueryResult, results)
+        try:
+            query_chunks = self.configs.query
+            assert query_chunks
+            assert results["metadatas"] is not None
+            assert results["documents"] is not None
+            group_by_chunk = QueryInclude.chunk in self.configs.include
+            documents: DefaultDict[str, list[float]] = defaultdict(list)
+            for query_chunk_idx, query_message in enumerate(query_chunks):
+                chunk_ids = results["ids"][query_chunk_idx]
+                chunk_metas = results["metadatas"][query_chunk_idx]
+                chunk_docs = results["documents"][query_chunk_idx]
+                scores = await self.compute_similarity(chunk_docs, query_message)
+                for i, score in enumerate(scores):
+                    if group_by_chunk:
+                        documents[chunk_ids[i]].append(float(score))
+                    else:
+                        documents[str(chunk_metas[i]["path"])].append(float(score))
+
+            logger.debug("Document scores: %s", documents)
+            if not documents:
+                # The mean of an empty sequence is NaN, which `int()` rejects;
+                # with nothing scored there is nothing to rank.
+                return []
+
+            top_k = int(numpy.mean(tuple(len(i) for i in documents.values())))
+            for key in documents:
+                documents[key] = heapq.nlargest(top_k, documents[key])
+
+            return heapq.nlargest(
+                self.n_result,
+                documents.keys(),
+                key=lambda x: float(numpy.mean(documents[x])),
+            )
+        finally:
+            # Always drop the reference, even if `compute_similarity` raised.
+            self._raw_results = None