feat(cli): Refactor vectorise command to use DB adapter layer

Zhe Yu · Zhe Yu · commit af1454d62817 · 2025-09-14T13:11:48.000+08:00
diff --git a/src/vectorcode/cli_utils.py b/src/vectorcode/cli_utils.py
@@ -708,27 +708,34 @@ def from_path(cls, spec_path: str, project_root: Optional[str] = None):
         return cls(spec_path, base_dir)
 
     def __init__(self, spec: str | GitIgnoreSpec, base_dir: str = "."):
+        self.spec: GitIgnoreSpec
         if isinstance(spec, str):
             with open(spec) as fin:
                 self.spec = GitIgnoreSpec.from_lines(
                     (i.strip() for i in fin.readlines())
                 )
         else:
             self.spec = spec
-        self.base_dir = base_dir
+        # Resolved once so containment checks compare absolute paths.
+        self.base_dir = Path(base_dir).resolve()
+
+    def match_file(self, path: str, negated: bool = False) -> bool:
+        """Decide whether `path` passes this spec.
+
+        Paths that do not live under `base_dir` always pass. Otherwise the
+        spec's verdict for the path relative to `base_dir` is returned,
+        inverted when `negated` is true.
+        """
+        if self.base_dir not in Path(path).resolve().parents:
+            return True
+        matched = self.spec.match_file(os.path.relpath(path, self.base_dir))
+        return not matched if negated else matched
 
     def match(
         self, paths: Iterable[str], negated: bool = False
     ) -> Generator[str, None, None]:
         # get paths relative to `base_dir`
 
-        base = Path(self.base_dir).resolve()
         for p in paths:
-            if base in Path(p).resolve().parents:
-                should_yield = self.spec.match_file(os.path.relpath(p, self.base_dir))
-                if negated:
-                    should_yield = not should_yield
-                if should_yield:
-                    yield p
-            else:
+            if self.match_file(p, negated):
                 yield p
diff --git a/src/vectorcode/database/base.py b/src/vectorcode/database/base.py
@@ -180,6 +180,11 @@ def get_embedding(self, texts: str | list[str]) -> list[NDArray]:
         """
         if isinstance(texts, str):
             texts = [texts]
+        if len(texts) == 0:
+            return []
+        texts = list(texts)
+        # Lazy %-args: skip formatting every chunk when debug logging is off.
+        logger.debug("Getting embeddings for %s", texts)
         embeddings = get_embedding_function(self._configs)(texts)
         if self._configs.embedding_dims:
             embeddings = [e[: self._configs.embedding_dims] for e in embeddings]
diff --git a/src/vectorcode/database/chroma0.py b/src/vectorcode/database/chroma0.py
@@ -363,6 +363,8 @@ async def vectorise(
 
         chunks = tuple(chunker.chunk(file_path))
         embeddings = self.get_embedding(list(i.text for i in chunks))
+        if len(embeddings) == 0:
+            return VectoriseStats(skipped=1)
 
         file_hash = hash_file(file_path)
 
@@ -501,7 +503,9 @@ async def delete(self) -> int:
         ]
         files_in_collection = set(
             str(expand_path(i.path, True))
-            for i in (await self.list_collection_content(ResultType.document)).files
+            for i in (
+                await self.list_collection_content(what=ResultType.document)
+            ).files
         )
 
         rm_paths = {
diff --git a/src/vectorcode/subcommands/vectorise/__init__.py b/src/vectorcode/subcommands/vectorise/__init__.py
@@ -28,7 +28,9 @@
 )
 from vectorcode.database import get_database_connector
 from vectorcode.database.base import DatabaseConnectorBase
-from vectorcode.database.types import VectoriseStats
+from vectorcode.database.errors import CollectionNotFoundError
+from vectorcode.database.types import ResultType, VectoriseStats
+from vectorcode.subcommands.vectorise.filter import FilterManager
 
 logger = logging.getLogger(name=__name__)
 
@@ -261,19 +263,33 @@ async def vectorise(configs: Config) -> int:
         include_hidden=configs.include_hidden,
     )
 
-    # TODO: check file hashes
+    filters = FilterManager()
+
+    try:
+        collection_files = (
+            await database.list_collection_content(what=ResultType.document)
+        ).files
+
+        existing_hashes = {i.sha256 for i in collection_files}
+    except CollectionNotFoundError:
+        existing_hashes = set()
 
     if not configs.force:
         for spec_path in find_exclude_specs(configs):
+            # filter by gitignore/vectorcode.exclude
             if os.path.isfile(spec_path):
                 logger.info(f"Loading ignore specs from {spec_path}.")
-                files = exclude_paths_by_spec(
-                    (str(i) for i in files), spec_path, str(configs.project_root)
-                )
-                logger.debug(f"Files after excluding: {files}")
+                spec = SpecResolver.from_path(spec_path)
+                # Bind `spec` per iteration; a bare closure would late-bind
+                # and make every filter use the last spec loaded.
+                filters.add_filter(lambda x, spec=spec: spec.match_file(x, True))
+
+        # filter by sha256
+        filters.add_filter(lambda x: hash_file(x) not in existing_hashes)
     else:  # pragma: nocover
         logger.info("Ignoring exclude specs.")
 
+    files = list(filters(files))
     stats = VectoriseStats()
     stats_lock = Lock()
     semaphore = asyncio.Semaphore(os.cpu_count() or 1)
diff --git a/src/vectorcode/subcommands/vectorise/filter.py b/src/vectorcode/subcommands/vectorise/filter.py
@@ -0,0 +1,67 @@
+import logging
+import os
+import sys
+from typing import Callable, Iterable, Self, Sequence
+
+logger = logging.getLogger(name=__name__)
+
+FileFilter = Callable[[str], bool]
+
+
+class FilterManager:
+    """Ordered chain of file predicates applied to an iterable of paths.
+
+    A path is kept only if every registered filter returns a truthy value
+    for it; filters run in the order they were added.
+    """
+
+    def __init__(self, from_filters: Sequence[FileFilter] | None = None) -> None:
+        """Start with the filters in `from_filters`, if any were given."""
+        self._filters: list[FileFilter] = []
+        if from_filters:
+            self._filters.extend(from_filters)
+
+    def add_filter(self, f: FileFilter = lambda x: bool(x)) -> Self:
+        """Append `f` to the chain and return `self` to allow chaining."""
+        self._filters.append(f)
+        return self
+
+    @staticmethod
+    def _filter_name(f: FileFilter) -> str:
+        """Return a loggable name for `f`, even when it has no `__name__`."""
+        # e.g. functools.partial objects and callable instances lack __name__.
+        return getattr(f, "__name__", repr(f))
+
+    def _has_debugging(self):  # pragma: nocover
+        """
+        Iterators are difficult to debug.
+        Use this function to decide whether we should convert iterators to tuples
+        to make debugging easier.
+        """
+        return (
+            sys.gettrace() is not None
+            or os.environ.get("VECTORCODE_LOG_LEVEL") is not None
+        )
+
+    def __call__(self, files: Iterable[str]) -> Iterable[str]:
+        """Run `files` through every filter in insertion order.
+
+        The result is a lazily evaluated `filter` chain; when debugging is
+        detected each stage is materialised into a tuple so it can be logged.
+        """
+        if self._has_debugging():  # pragma: nocover
+            files = tuple(files)
+            logger.debug(
+                f"Applying the following filters: {[self._filter_name(i) for i in self._filters]} to the following files ({len(files)}): {files}"
+            )
+
+        for f in self._filters:
+            files = filter(f, files)
+
+            if self._has_debugging():  # pragma: nocover
+                files = tuple(files)
+                logger.debug(
+                    f"{self._filter_name(f)} remaining items ({len(files)}): {files}"
+                )
+
+        return files