diff --git a/.github/workflows/faiss.yml b/.github/workflows/faiss.yml new file mode 100644 index 0000000000..f78bf2b586 --- /dev/null +++ b/.github/workflows/faiss.yml @@ -0,0 +1,76 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / faiss + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/faiss/**" + - "!integrations/faiss/*.md" + - ".github/workflows/faiss.yml" + +concurrency: + group: faiss-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + +defaults: + run: + working-directory: integrations/faiss + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + # FAISS wheels are most reliable on Linux in CI. + os: [ubuntu-latest] #[ubuntu-latest, windows-latest, macos-latest] + python-version: ["3.10", "3.13"] + + steps: + - uses: actions/checkout@v6 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install hatch "virtualenv<21.0.0" + + - name: Lint + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: hatch run fmt-check && hatch run test:types + + - name: Run tests + run: hatch run test:cov-retry + + - name: Run unit tests with lowest direct dependencies + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: | + hatch env prune + hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt + hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt + hatch run test:unit + + - name: Nightly - run unit tests with Haystack main branch + if: github.event_name == 'schedule' + run: | + hatch env prune + hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main + hatch run test:unit + + - name: Send event to Datadog for nightly failures + if: failure() && github.event_name == 'schedule' + uses: ./.github/actions/send_failure + with: + title: | + Core integrations nightly tests failure: ${{ github.workflow }} + api-key: ${{ secrets.CORE_DATADOG_API_KEY }} diff --git a/integrations/faiss/pyproject.toml b/integrations/faiss/pyproject.toml index 5646508a95..016f26822f 100644 --- a/integrations/faiss/pyproject.toml +++ b/integrations/faiss/pyproject.toml @@ -24,7 +24,7 @@ classifiers = [ dependencies = [ "haystack-ai>=2.24.0", "faiss-cpu>=1.8.0", - "numpy", + "numpy>=1.22,<2; python_version < '3.13'", ] [project.urls] diff --git a/integrations/faiss/src/haystack_integrations/components/retrievers/faiss/embedding_retriever.py b/integrations/faiss/src/haystack_integrations/components/retrievers/faiss/embedding_retriever.py index fb918cf383..fde48d7bf1 100644 --- a/integrations/faiss/src/haystack_integrations/components/retrievers/faiss/embedding_retriever.py +++ b/integrations/faiss/src/haystack_integrations/components/retrievers/faiss/embedding_retriever.py @@ -50,7 +50,7 @@ class FAISSEmbeddingRetriever: assert res["retriever"]["documents"][0].content == "There are over 7,000 languages spoken around the world today." ``` - """ # noqa: E501 + """ # noqa: E501 def __init__( self, diff --git a/integrations/faiss/src/haystack_integrations/components/retrievers/faiss/py.typed b/integrations/faiss/src/haystack_integrations/components/retrievers/faiss/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/faiss/src/haystack_integrations/document_stores/faiss/document_store.py b/integrations/faiss/src/haystack_integrations/document_stores/faiss/document_store.py index fc78f19cd6..050a4929aa 100644 --- a/integrations/faiss/src/haystack_integrations/document_stores/faiss/document_store.py +++ b/integrations/faiss/src/haystack_integrations/document_stores/faiss/document_store.py @@ -9,7 +9,7 @@ from pathlib import Path from typing import Any -import faiss +import faiss # type: ignore[import-untyped] import numpy as np from haystack import default_from_dict, default_to_dict from haystack.dataclasses import Document @@ -40,6 +40,8 @@ def __init__( :param index_path: Path to save/load the index and documents. If None, the store is in-memory only. :param index_string: The FAISS index factory string. Default is "Flat". :param embedding_dim: The dimension of the embeddings. Default is 768. + :raises DocumentStoreError: If the FAISS index cannot be initialized. + :raises ValueError: If `index_path` points to a missing `.faiss` file when loading persisted data. """ self.index_path = index_path self.embedding_dim = embedding_dim @@ -68,6 +70,13 @@ def _create_new_index(self): msg = f"Could not create FAISS index with factory string '{self.index_string}': {e}" raise DocumentStoreError(msg) from e + def _get_index_or_raise(self) -> Any: + """Return the FAISS index or raise if it is unexpectedly missing.""" + if self.index is None: + msg = "FAISS index has not been initialized." + raise DocumentStoreError(msg) + return self.index + def count_documents(self) -> int: """ Returns the number of documents in the store. @@ -80,6 +89,7 @@ def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Docume :param filters: A dictionary of filters to apply. :return: A list of matching Documents. + :raises FilterError: If the filter structure is invalid. """ if not filters: return list(self.documents.values()) @@ -120,6 +130,9 @@ def write_documents(self, documents: list[Document], policy: DuplicatePolicy = D :param documents: The list of documents to write. :param policy: The policy to handle duplicate documents. :return: The number of documents written. + :raises ValueError: If `documents` is not an iterable of `Document` objects. + :raises DuplicateDocumentError: If a duplicate document is found and `policy` is `DuplicatePolicy.FAIL`. + :raises DocumentStoreError: If the FAISS index is unexpectedly unavailable when adding embeddings. """ if not isinstance(documents, Iterable) or isinstance(documents, (str, bytes)): msg = "param 'documents' must contain an iterable of objects of type Document." @@ -175,13 +188,16 @@ def write_documents(self, documents: list[Document], policy: DuplicatePolicy = D if vectors_to_add: vectors = np.array(vectors_to_add, dtype="float32") ids = np.array(ids_to_add_to_index, dtype="int64") - self.index.add_with_ids(vectors, ids) + index = self._get_index_or_raise() + index.add_with_ids(vectors, ids) return docs_written def delete_documents(self, document_ids: list[str]) -> None: """ Deletes documents from the store. + + :raises DocumentStoreError: If the FAISS index is unexpectedly unavailable when removing embeddings. """ if not document_ids: return @@ -197,9 +213,10 @@ def delete_documents(self, document_ids: list[str]) -> None: del self.id_map[int_id] ids_to_remove_from_index.append(int_id) - if ids_to_remove_from_index and self.index.ntotal > 0: + index = self._get_index_or_raise() + if ids_to_remove_from_index and index.ntotal > 0: ids_array = np.array(ids_to_remove_from_index, dtype="int64") - self.index.remove_ids(ids_array) + index.remove_ids(ids_array) def delete_all_documents(self) -> None: """ @@ -221,6 +238,7 @@ def search( :param top_k: The number of results to return. :param filters: Filters to apply. :return: A list of matching Documents. + :raises FilterError: If the filter structure is invalid. """ if not self.index or self.index.ntotal == 0: return [] @@ -301,6 +319,9 @@ def _check_condition(self, doc: Document, condition: dict[str, Any]) -> bool: msg = "Missing 'field' in filter condition" raise FilterError(msg) field = condition.get("field") + if not isinstance(field, str): + msg = "'field' in filter condition must be a string" + raise FilterError(msg) if "value" not in condition: msg = "Missing 'value' in filter condition" raise FilterError(msg) @@ -370,6 +391,8 @@ def delete_by_filter(self, filters: dict[str, Any]) -> int: :param filters: A dictionary of filters to apply to find documents to delete. :returns: The number of documents deleted. + :raises FilterError: If the filter structure is invalid. + :raises DocumentStoreError: If the FAISS index is unexpectedly unavailable when removing embeddings. """ docs_to_delete = self.filter_documents(filters) ids = [doc.id for doc in docs_to_delete] @@ -382,6 +405,7 @@ def count_documents_by_filter(self, filters: dict[str, Any]) -> int: :param filters: A dictionary of filters to apply. :returns: The number of matching documents. + :raises FilterError: If the filter structure is invalid. """ return len(self.filter_documents(filters)) @@ -395,6 +419,7 @@ def update_by_filter(self, filters: dict[str, Any], meta: dict[str, Any]) -> int :param filters: A dictionary of filters to apply to find documents to update. :param meta: A dictionary of metadata key-value pairs to update in the matching documents. :returns: The number of documents updated. + :raises FilterError: If the filter structure is invalid. """ docs_to_update = self.filter_documents(filters) for doc in docs_to_update: @@ -505,9 +530,11 @@ def from_dict(cls, data: dict[str, Any]) -> "FAISSDocumentStore": def save(self, index_path: str | Path) -> None: """ Saves the index and documents to disk. + + :raises DocumentStoreError: If the FAISS index is unexpectedly unavailable. """ path = Path(index_path) - faiss.write_index(self.index, str(path.with_suffix(".faiss"))) + faiss.write_index(self._get_index_or_raise(), str(path.with_suffix(".faiss"))) # Save documents and ID mapping data = { @@ -523,6 +550,8 @@ def save(self, index_path: str | Path) -> None: def load(self, index_path: str | Path) -> None: """ Loads the index and documents from disk. + + :raises ValueError: If the `.faiss` file does not exist. """ path = Path(index_path) if not path.with_suffix(".faiss").exists():