cleanup and documentation.

Davidyz · Davidyz · commit 27c7cf0ce65b · 2025-05-21T11:04:29.000+01:00
diff --git a/docs/cli.md b/docs/cli.md
@@ -18,6 +18,7 @@
   * [Vectorising Your Code](#vectorising-your-code)
     * [File Specs](#file-specs)
   * [Making a Query](#making-a-query)
+    * [Query Rewriting](#query-rewriting)
   * [Listing All Collections](#listing-all-collections)
   * [Removing a Collection](#removing-a-collection)
   * [Checking Project Setup](#checking-project-setup)
@@ -356,7 +357,13 @@ The JSON configuration file may hold the following values:
   command line flag. You can also set this to `_auto`, which uses
   [charset-normalizer](https://charset-normalizer.readthedocs.io/en/latest/index.html)
   to automatically detect the encoding, but this is not very accurate,
-  especially on small files.
+  especially on small files;
+- `rewriter`: string, the type of rewriter to use. Currently the only supported
+  value is `OpenAIRewriter`, which uses an OpenAI-compatible LLM provider as the
+  rewriter;
+- `rewriter_params`: dictionary, the options to be used for the construction of
+  the rewriter. The options are documented in [the source
+  code](../src/vectorcode/rewriter/).
 
 See 
 [the wiki](https://github.com/Davidyz/VectorCode/wiki/Default-Configuration#default-cli-configuration) 
@@ -461,6 +468,47 @@ the number of retrieved chunks when you use `--include chunk`. For the sake of
 completeness, the first and last lines of a chunk will be completed to include
 the whole lines if the chunker broke the text from mid-line.
 
+#### Query Rewriting
+
+When your query messages are noisy (for example, containing a lot of symbols
+that are not relevant to the RAG tasks), the retrieval results may be
+compromised. To address this, you can try to use 
+[query rewriting](https://docs.llamaindex.ai/en/stable/examples/query_transformations/query_transform_cookbook/#query-rewriting).
+The VectorCode implementation of query rewriting uses an LLM to rewrite your
+search query so that it contains a curated list of keywords and (hopefully)
+improves your search results. To do this, you need to [configure your rewriter](#configuring-vectorcode)
+and pass the `--rewrite` flag to your query command. For example: 
+```json5
+// .vectorcode/config.json
+// `OpenAIRewriter` works with any OpenAI-compatible LLM API service that
+// provides structured output.
+{
+  "rewriter": "OpenAIRewriter",
+  "rewriter_params": {
+    "client_kwargs": { 
+      // see openai.Client
+      // https://github.com/openai/openai-python/blob/67997a4ec1ebcdf8e740afb0d0b2e37897657bde/src/openai/_client.py#L80
+      "base_url": "https://api.siliconflow.cn/v1",
+      "api_key": "$SILICONFLOW_API_KEY"
+    },
+    "completion_kwargs": { 
+      // see openai.Client.beta.chat.completions.parse
+      // https://github.com/openai/openai-python/blob/main/helpers.md#structured-outputs-parsing-helpers
+      "model": "Qwen/Qwen2.5-7B-Instruct",
+      "temperature": 0
+    }
+  }
+}
+```
+
+And when making a query, if you pass the `--rewrite` flag, VectorCode will send
+your query message to the LLM and get a list of strings, which it will use as
+the query for the search:
+
+```bash
+vectorcode query "reranker implementation" "class" "struct" "transformers" --rewrite
+```
+
 ### Listing All Collections
 
 You can use `vectorcode ls` command to list all collections in your ChromaDB.
diff --git a/src/vectorcode/rewriter/__init__.py b/src/vectorcode/rewriter/__init__.py
@@ -25,5 +25,5 @@ def get_rewriter(configs: Config) -> Optional[RewriterBase]:
         rewriter_cls = getattr(sys.modules[__name__], configs.rewriter)
         if issubclass(rewriter_cls, RewriterBase):
             logger.info(f"Loaded {configs.rewriter}")
-            return rewriter_cls(configs)
+            return rewriter_cls.create(configs)
     raise RewriterError(f"Failed to find {configs.rewriter}!")
diff --git a/src/vectorcode/rewriter/base.py b/src/vectorcode/rewriter/base.py
@@ -8,6 +8,20 @@ def __init__(self, config: Config) -> None:
         super().__init__()
         self.config = config
 
+    @classmethod
+    def create(cls, configs: Config):
+        try:
+            return cls(configs)
+        except Exception as e:
+            e.add_note(
+                "\n"
+                + (
+                    cls.__doc__
+                    or f"There was an issue initialising {cls}. Please doublecheck your configuration."
+                )
+            )
+            raise
+
     @abstractmethod
     async def rewrite(self, original_query: list[str]) -> list[str]:
         raise NotImplementedError
diff --git a/src/vectorcode/rewriter/openai.py b/src/vectorcode/rewriter/openai.py
@@ -17,6 +17,16 @@ class _NewQuery(BaseModel):
 
 
 class OpenAIRewriter(RewriterBase):
+    """
+    OpenAIRewriter class is an adapter for openai-compatible API services that provides
+    structured output support. The `rewriter_params` dictionary accepts 3 keys:
+        - `client_kwargs`: dictionary, containing arguments that are passed to `openai.Client`.
+          See https://github.com/openai/openai-python/blob/67997a4ec1ebcdf8e740afb0d0b2e37897657bde/src/openai/_client.py#L80;
+        - `completion_kwargs`: dictionary, containing arguments that are passed to `openai.Client.beta.chat.completions.parse`.
+          See https://github.com/openai/openai-python/blob/main/helpers.md#structured-outputs-parsing-helpers.
+        - `system_prompt`: string, the system prompt that contains the guidelines for rewriting the query.
+    """
+
     def __init__(self, config: Config) -> None:
         super().__init__(config)
         self.client = openai.Client(
@@ -65,7 +75,7 @@ async def rewrite(self, original_query: list[str]):
             )
             if comp is None or len(comp.choices) == 0:
                 logger.info(
-                    "Recieved no rewritten query. Fallingback to original_query."
+                    "Received no rewritten query. Fallingback to original_query."
                 )
                 return original_query
             choice = comp.choices[0].message
diff --git a/tests/rewriter/test_rewriter.py b/tests/rewriter/test_rewriter.py
@@ -21,7 +21,7 @@ def test_get_openai_rewriter():
         patch("vectorcode.rewriter.issubclass") as mock_issubclass,
     ):
         mock_rewriter = MagicMock()
-        mock_openai_cls.return_value = mock_rewriter
+        mock_openai_cls.create.return_value = mock_rewriter
         mock_issubclass.return_value = True
         assert get_rewriter(Config(rewriter="OpenAIRewriter")) == mock_rewriter