diff --git a/.github/labeler.yml b/.github/labeler.yml index 349edf670c..4de09686d4 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -9,6 +9,11 @@ integration:amazon-bedrock: - any-glob-to-any-file: "integrations/amazon_bedrock/**/*" - any-glob-to-any-file: ".github/workflows/amazon_bedrock.yml" +integration:amazon-s3-vectors: + - changed-files: + - any-glob-to-any-file: "integrations/amazon_s3_vectors/**/*" + - any-glob-to-any-file: ".github/workflows/amazon_s3_vectors.yml" + integration:amazon-sagemaker: - changed-files: - any-glob-to-any-file: "integrations/amazon_sagemaker/**/*" diff --git a/.github/workflows/CI_coverage_comment.yml b/.github/workflows/CI_coverage_comment.yml index e4d682b7cf..211c8504aa 100644 --- a/.github/workflows/CI_coverage_comment.yml +++ b/.github/workflows/CI_coverage_comment.yml @@ -6,6 +6,7 @@ on: - "Test / aimlapi" - "Test / amazon-bedrock" - "Test / amazon-sagemaker" + - "Test / amazon-s3-vectors" - "Test / anthropic" - "Test / arcadedb" - "Test / astra" diff --git a/.github/workflows/amazon_s3_vectors.yml b/.github/workflows/amazon_s3_vectors.yml new file mode 100644 index 0000000000..1fa282e190 --- /dev/null +++ b/.github/workflows/amazon_s3_vectors.yml @@ -0,0 +1,139 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / amazon-s3-vectors + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/amazon_s3_vectors/**" + - "!integrations/amazon_s3_vectors/*.md" + - ".github/workflows/amazon_s3_vectors.yml" + push: + branches: + - main + paths: + - "integrations/amazon_s3_vectors/**" + - "!integrations/amazon_s3_vectors/*.md" + - ".github/workflows/amazon_s3_vectors.yml" + +defaults: + run: + working-directory: integrations/amazon_s3_vectors + +concurrency: + group: amazon_s3_vectors-${{ github.head_ref || github.sha }} + cancel-in-progress: true + +env: + 
PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + TEST_MATRIX_OS: '["ubuntu-latest", "windows-latest", "macos-latest"]' + TEST_MATRIX_PYTHON: '["3.10", "3.14"]' + +jobs: + compute-test-matrix: + runs-on: ubuntu-slim + defaults: + run: + working-directory: . + outputs: + os: ${{ steps.set.outputs.os }} + python-version: ${{ steps.set.outputs.python-version }} + steps: + - id: set + run: | + echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT" + echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT" + + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + needs: compute-test-matrix + permissions: + contents: write + pull-requests: write + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }} + python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }} + + steps: + - name: Support longpaths + if: matrix.os == 'windows-latest' + working-directory: . + run: git config --system core.longpaths true + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install --upgrade hatch + - name: Lint + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: hatch run fmt-check && hatch run test:types + + - name: Run unit tests + run: hatch run test:unit-cov-retry + + # On PR: posts coverage comment (directly on same-repo PRs; via artifact for fork PRs). On push to main: stores coverage baseline on data branch. 
+ - name: Store unit tests coverage + id: coverage_comment + if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name != 'schedule' + uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40 + with: + GITHUB_TOKEN: ${{ github.token }} + COVERAGE_PATH: integrations/amazon_s3_vectors + SUBPROJECT_ID: amazon_s3_vectors + MINIMUM_GREEN: 90 + MINIMUM_ORANGE: 60 + + - name: Upload coverage comment to be posted + if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name == 'pull_request' && steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true' + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: coverage-comment-amazon_s3_vectors + path: python-coverage-comment-action-amazon_s3_vectors.txt + + - name: Run integration tests + run: hatch run test:integration-cov-append-retry + + - name: Store combined coverage + if: github.event_name == 'push' + uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40 + with: + GITHUB_TOKEN: ${{ github.token }} + COVERAGE_PATH: integrations/amazon_s3_vectors + SUBPROJECT_ID: amazon_s3_vectors-combined + MINIMUM_GREEN: 90 + MINIMUM_ORANGE: 60 + + - name: Run unit tests with lowest direct dependencies + if: github.event_name != 'push' + run: | + hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt + hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt + hatch run test:unit + + - name: Nightly - run unit tests with Haystack main branch + if: github.event_name == 'schedule' + run: | + hatch env prune + hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main + hatch run test:unit + + + notify-slack-on-failure: + needs: run + if: failure() && github.event_name == 'schedule' + runs-on: ubuntu-slim + steps: + - uses: 
deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1 + with: + slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }} diff --git a/README.md b/README.md index 42891b8360..d3691a3590 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta |-------------------------------------------------------------------------|-----------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------|---------------------| | [aimlapi-haystack](integrations/aimlapi/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/aimlapi-haystack.svg)](https://pypi.org/project/aimlapi-haystack) | [![Test / aimlapi](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/aimlapi.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/aimlapi.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-aimlapi/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-aimlapi/htmlcov/index.html) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-aimlapi-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-aimlapi-combined/htmlcov/index.html) | | [amazon-bedrock-haystack](integrations/amazon_bedrock/) | Embedder, Generator, Ranker, Downloader | [![PyPI - Version](https://img.shields.io/pypi/v/amazon-bedrock-haystack.svg)](https://pypi.org/project/amazon-bedrock-haystack) | [![Test / amazon_bedrock](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_bedrock.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_bedrock.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-amazon_bedrock/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-amazon_bedrock/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-amazon_bedrock-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-amazon_bedrock-combined/htmlcov/index.html) | +| [amazon-s3-vectors-haystack](integrations/amazon_s3_vectors/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/amazon-s3-vectors-haystack.svg)](https://pypi.org/project/amazon-s3-vectors-haystack) | [![Test / 
amazon_s3_vectors](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_s3_vectors.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_s3_vectors.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-amazon_s3_vectors/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-amazon_s3_vectors/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-amazon_s3_vectors-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-amazon_s3_vectors-combined/htmlcov/index.html) | | [amazon-sagemaker-haystack](integrations/amazon_sagemaker/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/amazon-sagemaker-haystack.svg)](https://pypi.org/project/amazon-sagemaker-haystack) | [![Test / amazon_sagemaker](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_sagemaker.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_sagemaker.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-amazon_sagemaker/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-amazon_sagemaker/htmlcov/index.html) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-amazon_sagemaker-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-amazon_sagemaker-combined/htmlcov/index.html) | | [anthropic-haystack](integrations/anthropic/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/anthropic-haystack.svg)](https://pypi.org/project/anthropic-haystack) | [![Test / anthropic](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/anthropic.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/anthropic.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-anthropic/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-anthropic/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-anthropic-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-anthropic-combined/htmlcov/index.html) | | [arcadedb-haystack](integrations/arcadedb/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/arcadedb-haystack.svg)](https://pypi.org/project/arcadedb-haystack) | [![Test / arcadedb](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/arcadedb.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/arcadedb.yml) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-arcadedb/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-arcadedb/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-arcadedb-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-arcadedb-combined/htmlcov/index.html) | diff --git a/integrations/amazon_s3_vectors/LICENSE.txt b/integrations/amazon_s3_vectors/LICENSE.txt new file mode 100644 index 0000000000..6134ab324f --- /dev/null +++ b/integrations/amazon_s3_vectors/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2023-present deepset GmbH + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/integrations/amazon_s3_vectors/README.md b/integrations/amazon_s3_vectors/README.md new file mode 100644 index 0000000000..8fd50ff2d1 --- /dev/null +++ b/integrations/amazon_s3_vectors/README.md @@ -0,0 +1,207 @@ +# amazon-s3-vectors-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/amazon-s3-vectors-haystack.svg)](https://pypi.org/project/amazon-s3-vectors-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/amazon-s3-vectors-haystack.svg)](https://pypi.org/project/amazon-s3-vectors-haystack) + +--- + +A [Haystack](https://haystack.deepset.ai/) integration for [Amazon S3 Vectors](https://aws.amazon.com/s3/features/vectors/), providing a Document Store and Embedding Retriever backed by native vector storage in Amazon S3. 
+ +## Installation + +```bash +pip install amazon-s3-vectors-haystack +``` + +## Usage + +### Document Store + +```python +from haystack.dataclasses import Document +from haystack.document_stores.types import DuplicatePolicy +from haystack_integrations.document_stores.amazon_s3_vectors import S3VectorsDocumentStore + +document_store = S3VectorsDocumentStore( + vector_bucket_name="my-vectors", + index_name="my-index", + dimension=768, + distance_metric="cosine", # or "euclidean" + region_name="us-east-1", +) + +# Write documents (embeddings are required) +docs = [ + Document(id="1", content="First document", embedding=[0.1] * 768, meta={"category": "news"}), + Document(id="2", content="Second document", embedding=[0.2] * 768, meta={"category": "sports"}), +] +document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + +# Count documents +print(document_store.count_documents()) + +# Delete documents +document_store.delete_documents(["1", "2"]) +``` + +### Embedding Retriever in a Pipeline + +```python +from haystack import Pipeline +from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder +from haystack.document_stores.types import DuplicatePolicy +from haystack_integrations.components.retrievers.amazon_s3_vectors import S3VectorsEmbeddingRetriever +from haystack_integrations.document_stores.amazon_s3_vectors import S3VectorsDocumentStore + +document_store = S3VectorsDocumentStore( + vector_bucket_name="my-vectors", + index_name="my-index", + dimension=768, +) + +# Index documents +doc_embedder = SentenceTransformersDocumentEmbedder() +doc_embedder.warm_up() +# ... embed and write documents ... 
+ +# Query pipeline +pipeline = Pipeline() +pipeline.add_component("embedder", SentenceTransformersTextEmbedder()) +pipeline.add_component("retriever", S3VectorsEmbeddingRetriever(document_store=document_store, top_k=5)) +pipeline.connect("embedder.embedding", "retriever.query_embedding") + +result = pipeline.run({"embedder": {"text": "What is the latest news?"}}) +print(result["retriever"]["documents"]) +``` + +### Filtering + +The retriever supports Haystack metadata filters, which are converted to S3 Vectors filter syntax: + +```python +# Filter during retrieval +result = pipeline.run({ + "embedder": {"text": "sports news"}, + "retriever": { + "filters": { + "operator": "AND", + "conditions": [ + {"field": "meta.category", "operator": "==", "value": "sports"}, + {"field": "meta.year", "operator": ">=", "value": 2024}, + ], + } + }, +}) +``` + +**Supported filter operators:** `==`, `!=`, `>`, `>=`, `<`, `<=`, `in`, `not in`, `AND`, `OR` + +### AWS Authentication + +The document store uses the standard boto3 credential chain by default. You can also pass credentials explicitly: + +```python +from haystack.utils.auth import Secret + +document_store = S3VectorsDocumentStore( + vector_bucket_name="my-vectors", + index_name="my-index", + dimension=768, + region_name="us-east-1", + aws_access_key_id=Secret.from_env_var("AWS_ACCESS_KEY_ID"), + aws_secret_access_key=Secret.from_env_var("AWS_SECRET_ACCESS_KEY"), +) +``` + +## Configuration Reference + +| Parameter | Type | Default | Description | +|---|---|---|---| +| `vector_bucket_name` | `str` | *required* | Name of the S3 vector bucket | +| `index_name` | `str` | *required* | Name of the vector index within the bucket | +| `dimension` | `int` | *required* | Dimensionality of the embeddings (e.g. 
768, 1536) | +| `distance_metric` | `str` | `"cosine"` | `"cosine"` or `"euclidean"` | +| `region_name` | `str \| None` | `None` | AWS region (uses default from env if not set) | +| `aws_access_key_id` | `Secret \| None` | `None` | AWS access key ID | +| `aws_secret_access_key` | `Secret \| None` | `None` | AWS secret access key | +| `aws_session_token` | `Secret \| None` | `None` | AWS session token (for temporary credentials) | +| `create_bucket_and_index` | `bool` | `True` | Auto-create bucket and index if they don't exist | +| `non_filterable_metadata_keys` | `list[str] \| None` | `None` | Additional metadata keys to mark as non-filterable | + +## Known Limitations & Considerations + +### No Native Document Count API +S3 Vectors does not provide a dedicated count endpoint. `count_documents()` paginates through all +vector keys, which can be slow for large indexes (millions of vectors). + +### `filter_documents()` Is Expensive +S3 Vectors only supports metadata filtering **during vector similarity queries** — there is no +standalone "list documents matching filter" API. As a result, `filter_documents()` must: +1. List all vectors with their data and metadata (paginated) +2. Apply filters client-side in memory + +**For filtered retrieval, always prefer `S3VectorsEmbeddingRetriever` with filters**, which uses +the native `query_vectors` API with server-side filtering. + +### Embedding Required +Every document written to the store **must** have an embedding. Documents without embeddings +will be rejected. This is a fundamental constraint of S3 Vectors as a pure vector store. + +### No Keyword / BM25 Retrieval +S3 Vectors only supports dense vector similarity search. There is no keyword or BM25 search +capability. If you need hybrid search, consider pairing this with Amazon OpenSearch. + +### Vector Data Type +Only `float32` vectors are supported. Higher-precision values are automatically downcast. 
+ +### Metadata Size Limits +- **Total metadata per vector: 40 KB** (filterable + non-filterable combined) +- **Filterable metadata per vector: 2 KB** — user `meta` fields used in filters must fit in this budget +- **Non-filterable metadata keys per index: 10** — the integration reserves 4 internal keys + (`_content`, `_blob_data`, `_blob_meta`, `_blob_mime_type`), leaving 6 for user-defined keys +- Keys are set at index creation and **cannot be changed later** + +Large content (e.g. full document text) is stored as non-filterable metadata automatically. +If you store additional large metadata fields, declare them via `non_filterable_metadata_keys`. + +### Strict API Limits +- `put_vectors`: up to 500 vectors per call (handled automatically by the integration) +- `get_vectors`: up to 100 keys per call (handled automatically) +- `delete_vectors`: up to 500 keys per call (handled automatically) +- **`query_vectors`: maximum 100 results per query** — this is the hard cap on `top_k`. + If you need more than 100 results, you must implement pagination or use a different store. +- Combined PutVectors + DeleteVectors: up to 1,000 requests/second per index + +### Distance Metrics and Scoring +Only `cosine` and `euclidean` are supported. The metric is set at index creation time and cannot +be changed afterward. + +S3 Vectors returns raw **distances** (lower = more similar). The integration converts these to +Haystack-convention **scores** (higher = more similar): +- **Cosine:** `score = 1.0 - distance` (1.0 = identical, 0.0 = orthogonal) +- **Euclidean:** `score = -distance` (0.0 = identical, more negative = further) + +### No Embeddings Returned from Queries +The `query_vectors` API does not support returning the stored vector data alongside results. +Documents retrieved via `S3VectorsEmbeddingRetriever` will have `embedding=None`. If you need +the embedding vectors, use `filter_documents()` or fetch them separately via the boto3 client. 
+ +### Eventual Consistency +Newly written vectors may not be immediately visible in query results. S3 Vectors provides +eventual consistency for write-then-read operations. + +## Running Tests + +```bash +cd integrations/amazon_s3_vectors + +# Unit tests (no AWS credentials required) +hatch run test:unit + +# Integration tests (requires AWS credentials and S3 Vectors access) +hatch run test:integration +``` + +## License + +`amazon-s3-vectors-haystack` is distributed under the terms of the [Apache-2.0](https://spdx.org/licenses/Apache-2.0.html) license. diff --git a/integrations/amazon_s3_vectors/examples/example.py b/integrations/amazon_s3_vectors/examples/example.py new file mode 100644 index 0000000000..67d3a67c20 --- /dev/null +++ b/integrations/amazon_s3_vectors/examples/example.py @@ -0,0 +1,108 @@ +# Amazon S3 Vectors — Haystack Integration Example +# +# A serverless vector store on AWS: no database to provision, no cluster to manage. +# Just a bucket name and AWS credentials. +# +# This example indexes documents, runs unfiltered and filtered queries, and cleans up. 
+# +# Prerequisites: +# pip install amazon-s3-vectors-haystack "sentence-transformers>=2.2.0" +# AWS credentials configured (env vars, ~/.aws/credentials, or IAM role) + +import time + +from haystack import Document, Pipeline +from haystack.components.embedders import SentenceTransformersDocumentEmbedder, SentenceTransformersTextEmbedder +from haystack.components.writers import DocumentWriter +from haystack.document_stores.types import DuplicatePolicy + +from haystack_integrations.components.retrievers.amazon_s3_vectors import S3VectorsEmbeddingRetriever +from haystack_integrations.document_stores.amazon_s3_vectors import S3VectorsDocumentStore + +MODEL = "sentence-transformers/all-MiniLM-L6-v2" + +# --- Create the document store (bucket + index are created automatically) --- +document_store = S3VectorsDocumentStore( + vector_bucket_name="haystack-example", + index_name="products", + dimension=384, # all-MiniLM-L6-v2 produces 384-dim embeddings + distance_metric="cosine", + region_name="us-east-1", +) + +# --- Index documents with metadata --- +products = [ + Document( + content="Lightweight running shoes with breathable mesh", + meta={"category": "shoes", "price": 89.99, "in_stock": True}, + ), + Document( + content="Waterproof hiking boots with ankle support", + meta={"category": "shoes", "price": 149.99, "in_stock": True}, + ), + Document( + content="Classic leather dress shoes", + meta={"category": "shoes", "price": 199.99, "in_stock": False}, + ), + Document( + content="Insulated winter jacket with down filling", + meta={"category": "jackets", "price": 249.99, "in_stock": True}, + ), + Document( + content="Lightweight windbreaker for running", + meta={"category": "jackets", "price": 79.99, "in_stock": True}, + ), +] + +indexing = Pipeline() +indexing.add_component("embedder", SentenceTransformersDocumentEmbedder(model=MODEL)) +indexing.add_component("writer", DocumentWriter(document_store=document_store, policy=DuplicatePolicy.OVERWRITE)) 
+indexing.connect("embedder", "writer") + +print("Indexing products...") +indexing.run({"embedder": {"documents": products}}) +print(f"Indexed {document_store.count_documents()} products.\n") + +# --- Build the query pipeline --- +querying = Pipeline() +querying.add_component("embedder", SentenceTransformersTextEmbedder(model=MODEL)) +querying.add_component("retriever", S3VectorsEmbeddingRetriever(document_store=document_store, top_k=5)) +querying.connect("embedder", "retriever") + + +def search(query: str, filters: dict | None = None) -> None: + print(f"Query: '{query}'" + (f" Filter: {filters}" if filters else "")) + start = time.time() + result = querying.run({"embedder": {"text": query}, "retriever": {"filters": filters}}) + elapsed = (time.time() - start) * 1000 + for i, doc in enumerate(result["retriever"]["documents"], 1): + meta = doc.meta + print(f" {i}. [{doc.score:.3f}] {doc.content}") + print(f" category={meta['category']} price=${meta['price']} in_stock={meta['in_stock']}") + print(f" ({elapsed:.0f}ms)\n") + + +# Unfiltered: returns shoes AND jackets +search("something for running") + +# Filtered to shoes only — server-side, inside the vector search +search("something for running", filters={"field": "meta.category", "operator": "==", "value": "shoes"}) + +# Combined filter: in-stock AND under $100 +search( + "something for running", + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.in_stock", "operator": "==", "value": True}, + {"field": "meta.price", "operator": "<", "value": 100.0}, + ], + }, +) + +# --- Cleanup --- +document_store.delete_documents([doc.id for doc in products]) +client = document_store._get_client() +client.delete_index(vectorBucketName="haystack-example", indexName="products") +client.delete_vector_bucket(vectorBucketName="haystack-example") +print("Cleaned up all AWS resources.") diff --git a/integrations/amazon_s3_vectors/pydoc/config_docusaurus.yml b/integrations/amazon_s3_vectors/pydoc/config_docusaurus.yml 
new file mode 100644 index 0000000000..4320722107 --- /dev/null +++ b/integrations/amazon_s3_vectors/pydoc/config_docusaurus.yml @@ -0,0 +1,14 @@ +loaders: + - modules: + - haystack_integrations.components.retrievers.amazon_s3_vectors.embedding_retriever + - haystack_integrations.document_stores.amazon_s3_vectors.document_store + search_path: [../src] +processors: + - type: filter + documented_only: true + skip_empty_modules: true +renderer: + description: Amazon S3 Vectors integration for Haystack + id: integrations-amazon_s3_vectors + filename: amazon_s3_vectors.md + title: Amazon S3 Vectors diff --git a/integrations/amazon_s3_vectors/pyproject.toml b/integrations/amazon_s3_vectors/pyproject.toml new file mode 100644 index 0000000000..c44b588f4b --- /dev/null +++ b/integrations/amazon_s3_vectors/pyproject.toml @@ -0,0 +1,171 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "amazon-s3-vectors-haystack" +dynamic = ["version"] +description = "Haystack integration for amazon_s3_vectors" +readme = "README.md" +requires-python = ">=3.10" +license = "Apache-2.0" +keywords = ["aws", "s3", "vectors", "document-store", "haystack", "vector-search"] +authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = ["haystack-ai>=2.26.1", "boto3>=1.42.0"] + +[project.urls] +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/amazon_s3_vectors#readme" +Issues = 
"https://github.com/deepset-ai/haystack-core-integrations/issues" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/amazon_s3_vectors" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/amazon_s3_vectors-v(?P.*)' + +[tool.hatch.version.raw-options] +root = "../.." +git_describe_command = 'git describe --tags --match="integrations/amazon_s3_vectors-v[0-9]*"' + +[tool.hatch.envs.default] +installer = "uv" +dependencies = ["haystack-pydoc-tools", "ruff"] + +[tool.hatch.envs.default.scripts] +docs = ["haystack-pydoc pydoc/config_docusaurus.yml"] +fmt = "ruff check --fix {args}; ruff format {args}" +fmt-check = "ruff check {args} && ruff format --check {args}" + +[tool.hatch.envs.test] +dependencies = [ + "pytest", + "pytest-asyncio", + "pytest-cov", + "pytest-rerunfailures", + "mypy", + "pip", +] + +[tool.hatch.envs.test.scripts] +unit = 'pytest -m "not integration" {args:tests}' +integration = 'pytest -m "integration" {args:tests}' +all = 'pytest {args:tests}' +unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}' +integration-cov-append-retry = 'pytest --cov=haystack_integrations --cov-append --reruns 3 --reruns-delay 30 -x -m "integration" {args:tests}' +types = """mypy -p haystack_integrations.document_stores.amazon_s3_vectors \ +-p haystack_integrations.components.retrievers.amazon_s3_vectors {args}""" + +[tool.mypy] +install_types = true +non_interactive = true +check_untyped_defs = true +disallow_incomplete_defs = true + +[[tool.mypy.overrides]] +module = [ +"boto3.*", +"botocore.*", +] +ignore_missing_imports = true + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = [ + "A", + "ANN", + "ARG", + "B", + "C", + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D205", # 1 blank line required between 
summary line and description + "D209", # Closing triple quotes go to new line + "D213", # summary lines must be positioned on the second physical line of the docstring + "D417", # Missing argument descriptions in the docstring + "D419", # Docstring is empty + "DTZ", + "E", + "EM", + "F", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow function calls in argument defaults (common Haystack pattern for Secret.from_env_var) + "B008", + # Ignore checks for possible passwords + "S105", + "S106", + "S107", + # Ignore complexity + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", + # Allow `Any` type - used legitimately for dynamic types and SDK boundaries + "ANN401", +] + +[tool.ruff.lint.isort] +known-first-party = ["haystack_integrations"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.lint.per-file-ignores] +# Tests can use magic values, assertions, relative imports, and don't need type annotations +"tests/**/*" = ["E501", "PLC0415", "PLR2004", "S101", "S110", "TID252", "D", "ANN"] +# Examples can contain print statements and long lines +"examples/**/*" = ["D", "T201", "E501"] + +[tool.coverage.run] +source = ["haystack_integrations"] +branch = true +parallel = false +relative_files = true + +[tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing = true +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + +[tool.pytest.ini_options] +addopts = "--strict-markers" +markers = [ + "integration: integration tests", +] +log_cli = true +asyncio_default_fixture_loop_scope = "function" diff --git a/integrations/amazon_s3_vectors/src/haystack_integrations/components/py.typed b/integrations/amazon_s3_vectors/src/haystack_integrations/components/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff 
--git a/integrations/amazon_s3_vectors/src/haystack_integrations/components/retrievers/amazon_s3_vectors/__init__.py b/integrations/amazon_s3_vectors/src/haystack_integrations/components/retrievers/amazon_s3_vectors/__init__.py new file mode 100644 index 0000000000..39ebcd2f58 --- /dev/null +++ b/integrations/amazon_s3_vectors/src/haystack_integrations/components/retrievers/amazon_s3_vectors/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from .embedding_retriever import S3VectorsEmbeddingRetriever + +__all__ = ["S3VectorsEmbeddingRetriever"] diff --git a/integrations/amazon_s3_vectors/src/haystack_integrations/components/retrievers/amazon_s3_vectors/embedding_retriever.py b/integrations/amazon_s3_vectors/src/haystack_integrations/components/retrievers/amazon_s3_vectors/embedding_retriever.py new file mode 100644 index 0000000000..fbef8505d5 --- /dev/null +++ b/integrations/amazon_s3_vectors/src/haystack_integrations/components/retrievers/amazon_s3_vectors/embedding_retriever.py @@ -0,0 +1,130 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any + +from haystack import component, default_from_dict, default_to_dict +from haystack.dataclasses import Document +from haystack.document_stores.types import FilterPolicy +from haystack.document_stores.types.filter_policy import apply_filter_policy + +from haystack_integrations.document_stores.amazon_s3_vectors import S3VectorsDocumentStore + + +@component +class S3VectorsEmbeddingRetriever: + """ + Retrieve documents from an ``S3VectorsDocumentStore`` based on their dense embeddings. 
+ + Usage example: + ```python + from haystack import Document, Pipeline + from haystack.components.embedders import SentenceTransformersTextEmbedder, SentenceTransformersDocumentEmbedder + from haystack.document_stores.types import DuplicatePolicy + from haystack_integrations.components.retrievers.amazon_s3_vectors import S3VectorsEmbeddingRetriever + from haystack_integrations.document_stores.amazon_s3_vectors import S3VectorsDocumentStore + + document_store = S3VectorsDocumentStore( + vector_bucket_name="my-vectors", + index_name="my-index", + dimension=768, + ) + + documents = [ + Document(content="There are over 7,000 languages spoken around the world today."), + Document(content="Elephants have been observed to behave in a way that indicates..."), + Document(content="In certain places, you can witness the phenomenon of bioluminescent waves."), + ] + + document_embedder = SentenceTransformersDocumentEmbedder() + document_embedder.warm_up() + documents_with_embeddings = document_embedder.run(documents) + + document_store.write_documents(documents_with_embeddings.get("documents"), policy=DuplicatePolicy.OVERWRITE) + + query_pipeline = Pipeline() + query_pipeline.add_component("text_embedder", SentenceTransformersTextEmbedder()) + query_pipeline.add_component("retriever", S3VectorsEmbeddingRetriever(document_store=document_store)) + query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding") + + query = "How many languages are there?" + res = query_pipeline.run({"text_embedder": {"text": query}}) + ``` + """ + + def __init__( + self, + *, + document_store: S3VectorsDocumentStore, + filters: dict[str, Any] | None = None, + top_k: int = 10, + filter_policy: str | FilterPolicy = FilterPolicy.REPLACE, + ) -> None: + """ + Initialize the S3VectorsEmbeddingRetriever. + + :param document_store: An instance of ``S3VectorsDocumentStore``. + :param filters: Filters applied to the retrieved Documents. + :param top_k: Maximum number of Documents to return. 
+ :param filter_policy: Policy to determine how filters are applied. + :raises ValueError: If ``document_store`` is not an ``S3VectorsDocumentStore``. + """ + if not isinstance(document_store, S3VectorsDocumentStore): + msg = "document_store must be an instance of S3VectorsDocumentStore" + raise ValueError(msg) + + self.document_store = document_store + self.filters = filters or {} + self.top_k = top_k + self.filter_policy = ( + filter_policy if isinstance(filter_policy, FilterPolicy) else FilterPolicy.from_str(filter_policy) + ) + + def to_dict(self) -> dict[str, Any]: + """Serialize this component to a dictionary.""" + return default_to_dict( + self, + filters=self.filters, + top_k=self.top_k, + filter_policy=self.filter_policy.value, + document_store=self.document_store.to_dict(), + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "S3VectorsEmbeddingRetriever": + """Deserialize the component from a dictionary.""" + data["init_parameters"]["document_store"] = S3VectorsDocumentStore.from_dict( + data["init_parameters"]["document_store"] + ) + if filter_policy := data["init_parameters"].get("filter_policy"): + data["init_parameters"]["filter_policy"] = FilterPolicy.from_str(filter_policy) + return default_from_dict(cls, data) + + @component.output_types(documents=list[Document]) + def run( + self, + query_embedding: list[float], + filters: dict[str, Any] | None = None, + top_k: int | None = None, + ) -> dict[str, list[Document]]: + """ + Retrieve documents from the S3VectorsDocumentStore based on dense embeddings. + + :param query_embedding: Embedding of the query. + :param filters: Filters applied to the retrieved Documents. The way runtime filters are applied depends on + the ``filter_policy`` chosen at retriever initialization. Filters are applied server-side during + the vector search. + :param top_k: Maximum number of Documents to return. S3 Vectors caps this at 100. 
+ :returns: A dictionary with key ``"documents"`` containing the retrieved Documents. + Returned documents will not contain embeddings. + """ + filters = apply_filter_policy(self.filter_policy, self.filters, filters) + top_k = top_k or self.top_k + + docs = self.document_store._embedding_retrieval( + query_embedding=query_embedding, + filters=filters, + top_k=top_k, + ) + return {"documents": docs} diff --git a/integrations/amazon_s3_vectors/src/haystack_integrations/document_stores/amazon_s3_vectors/__init__.py b/integrations/amazon_s3_vectors/src/haystack_integrations/document_stores/amazon_s3_vectors/__init__.py new file mode 100644 index 0000000000..4e6beddd97 --- /dev/null +++ b/integrations/amazon_s3_vectors/src/haystack_integrations/document_stores/amazon_s3_vectors/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from .document_store import S3VectorsDocumentStore + +__all__ = ["S3VectorsDocumentStore"] diff --git a/integrations/amazon_s3_vectors/src/haystack_integrations/document_stores/amazon_s3_vectors/document_store.py b/integrations/amazon_s3_vectors/src/haystack_integrations/document_stores/amazon_s3_vectors/document_store.py new file mode 100644 index 0000000000..3d6ddedf47 --- /dev/null +++ b/integrations/amazon_s3_vectors/src/haystack_integrations/document_stores/amazon_s3_vectors/document_store.py @@ -0,0 +1,524 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import base64 +import json +from dataclasses import replace +from typing import Any, Literal + +import boto3 +from botocore.exceptions import ClientError +from haystack import default_from_dict, default_to_dict, logging +from haystack.dataclasses import ByteStream, Document +from haystack.document_stores.errors import DocumentStoreError +from haystack.document_stores.types import DuplicatePolicy +from haystack.utils.auth import Secret, deserialize_secrets_inplace 
+from haystack.utils.filters import document_matches_filter + +from .filters import _normalize_filters, _validate_filters + +logger = logging.getLogger(__name__) + +# S3 Vectors allows up to 500 vectors per put_vectors call +_WRITE_BATCH_SIZE = 500 + +# S3 Vectors allows up to 100 keys per get_vectors call, 500 per delete_vectors call +_GET_BATCH_SIZE = 100 +_DELETE_BATCH_SIZE = 500 + +# Maximum number of results from query_vectors +_MAX_TOP_K = 100 + +# S3 Vectors metadata limits +_MAX_TOTAL_METADATA_BYTES = 40 * 1024 # 40 KB total + +# Reserved metadata keys used to store Haystack Document fields +_CONTENT_KEY = "_content" +_BLOB_DATA_KEY = "_blob_data" +_BLOB_META_KEY = "_blob_meta" +_BLOB_MIME_TYPE_KEY = "_blob_mime_type" + +_RESERVED_META_KEYS = {_CONTENT_KEY, _BLOB_DATA_KEY, _BLOB_META_KEY, _BLOB_MIME_TYPE_KEY} + +# These keys are stored but should not be used in query filters. +# They are configured as nonFilterableMetadataKeys on the index. +_NON_FILTERABLE_KEYS = [_CONTENT_KEY, _BLOB_DATA_KEY, _BLOB_META_KEY, _BLOB_MIME_TYPE_KEY] + + +class S3VectorsDocumentStore: + """ + A Document Store using [Amazon S3 Vectors](https://aws.amazon.com/s3/features/vectors/). + + Amazon S3 Vectors provides serverless vector storage and similarity search within Amazon S3. + This document store stores Haystack `Document` objects as vectors with associated metadata + in an S3 vector bucket and index. 
+ + **Service limits:** + + - Maximum ``top_k``: 100 results per query + - Maximum vector dimension: 4,096 + - Metadata per vector: 40 KB total, 2 KB filterable + - All documents must have embeddings (``float32`` only) + - Distance metrics: ``cosine`` or ``euclidean`` (set at index creation, immutable) + - ``filter_documents()`` is client-side — prefer ``S3VectorsEmbeddingRetriever`` with filters + + Usage example: + ```python + from haystack_integrations.document_stores.amazon_s3_vectors import S3VectorsDocumentStore + + document_store = S3VectorsDocumentStore( + vector_bucket_name="my-vectors", + index_name="my-index", + dimension=768, + ) + ``` + """ + + def __init__( + self, + *, + vector_bucket_name: str, + index_name: str, + dimension: int, + distance_metric: Literal["cosine", "euclidean"] = "cosine", + region_name: str | None = None, + aws_access_key_id: Secret | None = None, + aws_secret_access_key: Secret | None = None, + aws_session_token: Secret | None = None, + create_bucket_and_index: bool = True, + non_filterable_metadata_keys: list[str] | None = None, + ) -> None: + """ + Create an S3VectorsDocumentStore instance. + + :param vector_bucket_name: Name of the S3 vector bucket. + :param index_name: Name of the vector index within the bucket. + :param dimension: Dimensionality of the embeddings (e.g. 768, 1536). + :param distance_metric: Distance metric for similarity search: ``"cosine"`` or ``"euclidean"``. + :param region_name: AWS region. If not provided, uses the default from the environment/config. + :param aws_access_key_id: AWS access key ID. If not provided, uses the default credential chain. + :param aws_secret_access_key: AWS secret access key. + :param aws_session_token: AWS session token for temporary credentials. + :param create_bucket_and_index: Whether to automatically create the vector bucket and index + if they do not exist. Defaults to ``True``. 
+ :param non_filterable_metadata_keys: Additional metadata keys to mark as non-filterable + on the index (beyond the internal keys used for Document content/blob storage). + """ + self.vector_bucket_name = vector_bucket_name + self.index_name = index_name + self.dimension = dimension + self.distance_metric = distance_metric + self.region_name = region_name + self.aws_access_key_id = aws_access_key_id + self.aws_secret_access_key = aws_secret_access_key + self.aws_session_token = aws_session_token + self.create_bucket_and_index = create_bucket_and_index + self.non_filterable_metadata_keys = non_filterable_metadata_keys or [] + + self._client: Any = None + + def _get_client(self) -> Any: + """Lazily create and return the boto3 s3vectors client.""" + if self._client is not None: + return self._client + + kwargs: dict[str, Any] = {"service_name": "s3vectors"} + if self.region_name: + kwargs["region_name"] = self.region_name + if self.aws_access_key_id: + kwargs["aws_access_key_id"] = self.aws_access_key_id.resolve_value() + if self.aws_secret_access_key: + kwargs["aws_secret_access_key"] = self.aws_secret_access_key.resolve_value() + if self.aws_session_token: + kwargs["aws_session_token"] = self.aws_session_token.resolve_value() + + self._client = boto3.client(**kwargs) + + if self.create_bucket_and_index: + self._ensure_bucket_and_index() + + return self._client + + def _ensure_bucket_and_index(self) -> None: + """Create the vector bucket and index if they don't already exist.""" + client = self._client + + # Ensure bucket exists + try: + client.get_vector_bucket(vectorBucketName=self.vector_bucket_name) + logger.info("Using existing vector bucket '{bucket}'.", bucket=self.vector_bucket_name) + except ClientError as e: + if e.response["Error"]["Code"] == "NotFoundException": + logger.info("Creating vector bucket '{bucket}'.", bucket=self.vector_bucket_name) + client.create_vector_bucket(vectorBucketName=self.vector_bucket_name) + else: + raise + + # Ensure index 
exists + all_non_filterable = list(set(_NON_FILTERABLE_KEYS + self.non_filterable_metadata_keys)) + try: + client.get_index(vectorBucketName=self.vector_bucket_name, indexName=self.index_name) + logger.info( + "Using existing index '{index}' in bucket '{bucket}'. " + "`dimension`, `distance_metric`, and `non_filterable_metadata_keys` will be ignored.", + index=self.index_name, + bucket=self.vector_bucket_name, + ) + except ClientError as e: + if e.response["Error"]["Code"] == "NotFoundException": + logger.info( + "Creating index '{index}' in bucket '{bucket}' (dimension={dim}, metric={metric}).", + index=self.index_name, + bucket=self.vector_bucket_name, + dim=self.dimension, + metric=self.distance_metric, + ) + client.create_index( + vectorBucketName=self.vector_bucket_name, + indexName=self.index_name, + dataType="float32", + dimension=self.dimension, + distanceMetric=self.distance_metric, + metadataConfiguration={"nonFilterableMetadataKeys": all_non_filterable}, + ) + else: + raise + + def to_dict(self) -> dict[str, Any]: + """Serialize this document store to a dictionary.""" + return default_to_dict( + self, + vector_bucket_name=self.vector_bucket_name, + index_name=self.index_name, + dimension=self.dimension, + distance_metric=self.distance_metric, + region_name=self.region_name, + aws_access_key_id=self.aws_access_key_id.to_dict() if self.aws_access_key_id else None, + aws_secret_access_key=self.aws_secret_access_key.to_dict() if self.aws_secret_access_key else None, + aws_session_token=self.aws_session_token.to_dict() if self.aws_session_token else None, + create_bucket_and_index=self.create_bucket_and_index, + non_filterable_metadata_keys=self.non_filterable_metadata_keys, + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "S3VectorsDocumentStore": + """Deserialize a document store from a dictionary.""" + deserialize_secrets_inplace( + data["init_parameters"], + keys=["aws_access_key_id", "aws_secret_access_key", "aws_session_token"], + ) + 
return default_from_dict(cls, data) + + def count_documents(self) -> int: + """ + Return the number of documents in the document store. + + .. note:: + + S3 Vectors does not provide a dedicated count API. This method lists all vector keys + via pagination, which can be slow for large indexes. + """ + client = self._get_client() + count = 0 + next_token = None + while True: + kwargs: dict[str, Any] = { + "vectorBucketName": self.vector_bucket_name, + "indexName": self.index_name, + } + if next_token: + kwargs["nextToken"] = next_token + response = client.list_vectors(**kwargs) + count += len(response.get("vectors", [])) + next_token = response.get("nextToken") + if not next_token: + break + return count + + def write_documents(self, documents: list[Document], policy: DuplicatePolicy = DuplicatePolicy.OVERWRITE) -> int: + """ + Write Documents to the S3 Vectors index. + + All documents must have an embedding set. S3 Vectors ``put_vectors`` is an upsert operation + by default, so ``DuplicatePolicy.OVERWRITE`` is the natural behavior. + ``DuplicatePolicy.SKIP`` will check for existing documents first (slower). + ``DuplicatePolicy.NONE`` will raise an error if a document already exists. + + Metadata per vector is limited to 40 KB total (2 KB filterable). + + :param documents: A list of Documents to write. Each document must have an embedding. + :param policy: The duplicate policy. Defaults to ``DuplicatePolicy.OVERWRITE``. + :returns: The number of documents written. + """ + if len(documents) == 0: + return 0 + + client = self._get_client() + written = 0 + + for i in range(0, len(documents), _WRITE_BATCH_SIZE): + batch = documents[i : i + _WRITE_BATCH_SIZE] + + # Validate embeddings upfront + for doc in batch: + if doc.embedding is None: + msg = f"Document '{doc.id}' has no embedding. S3VectorsDocumentStore requires embeddings." 
+ raise DocumentStoreError(msg) + + # Batch-check for existing documents when needed + existing_ids: set[str] = set() + if policy in (DuplicatePolicy.SKIP, DuplicatePolicy.NONE): + batch_ids = [doc.id for doc in batch] + for j in range(0, len(batch_ids), _GET_BATCH_SIZE): + id_chunk = batch_ids[j : j + _GET_BATCH_SIZE] + response = client.get_vectors( + vectorBucketName=self.vector_bucket_name, + indexName=self.index_name, + keys=id_chunk, + ) + for v in response.get("vectors", []): + existing_ids.add(v["key"]) + + if policy == DuplicatePolicy.NONE and existing_ids: + msg = ( + f"Document(s) {sorted(existing_ids)} already exist in the document store. " + "Use DuplicatePolicy.OVERWRITE or DuplicatePolicy.SKIP." + ) + raise DocumentStoreError(msg) + + vectors_to_write = [] + for doc in batch: + if policy == DuplicatePolicy.SKIP and doc.id in existing_ids: + continue + vectors_to_write.append(self._document_to_s3_vector(doc)) + + if vectors_to_write: + client.put_vectors( + vectorBucketName=self.vector_bucket_name, + indexName=self.index_name, + vectors=vectors_to_write, + ) + written += len(vectors_to_write) + + return written + + def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]: + """ + Return documents matching the provided filters. + + .. warning:: + + S3 Vectors only supports metadata filtering during vector similarity queries, not as a + standalone operation. This method lists all vectors and applies filters client-side, + which can be very slow for large indexes. For filtered retrieval, prefer using + ``S3VectorsEmbeddingRetriever`` with filters instead. + + :param filters: Haystack-format filters to apply. + :returns: A list of matching Documents. + """ + if filters: + logger.warning( + "S3 Vectors does not support standalone filtered listing. " + "filter_documents() will fetch ALL vectors and apply filters client-side, " + "which can be very slow for large indexes. 
" + "Prefer using S3VectorsEmbeddingRetriever with filters for efficient filtered retrieval." + ) + + client = self._get_client() + + # list_vectors supports returnData and returnMetadata directly, + # so we can read documents in a single paginated pass without + # a separate get_vectors round-trip. + documents: list[Document] = [] + next_token = None + while True: + kwargs: dict[str, Any] = { + "vectorBucketName": self.vector_bucket_name, + "indexName": self.index_name, + "returnData": True, + "returnMetadata": True, + } + if next_token: + kwargs["nextToken"] = next_token + response = client.list_vectors(**kwargs) + for v in response.get("vectors", []): + documents.append(self._s3_vector_to_document(v)) + next_token = response.get("nextToken") + if not next_token: + break + + if filters: + _validate_filters(filters) + documents = [doc for doc in documents if document_matches_filter(filters=filters, document=doc)] + + return documents + + def delete_documents(self, document_ids: list[str]) -> None: + """ + Delete documents by their IDs. + + :param document_ids: List of document IDs to delete. + """ + if not document_ids: + return + + client = self._get_client() + for i in range(0, len(document_ids), _DELETE_BATCH_SIZE): + batch = document_ids[i : i + _DELETE_BATCH_SIZE] + client.delete_vectors( + vectorBucketName=self.vector_bucket_name, + indexName=self.index_name, + keys=batch, + ) + + def _embedding_retrieval( + self, + query_embedding: list[float], + *, + filters: dict[str, Any] | None = None, + top_k: int = 10, + ) -> list[Document]: + """ + Retrieve documents most similar to the query embedding. + + This method is not part of the public interface. + Use ``S3VectorsEmbeddingRetriever`` instead. + + :param query_embedding: The query embedding vector. + :param filters: Optional Haystack-format metadata filters. + :param top_k: Maximum number of results to return. S3 Vectors caps this at 100. + :returns: List of Documents sorted by similarity. 
Returned documents will not contain + embeddings (S3 Vectors ``query_vectors`` does not return vector data). + """ + if not query_embedding: + msg = "query_embedding must be a non-empty list of floats" + raise ValueError(msg) + + _validate_filters(filters) + s3_filter = _normalize_filters(filters) if filters else None + + if top_k > _MAX_TOP_K: + logger.warning( + "Requested top_k={top_k} exceeds S3 Vectors maximum of {max_k}. Results will be capped.", + top_k=top_k, + max_k=_MAX_TOP_K, + ) + + client = self._get_client() + + query_kwargs: dict[str, Any] = { + "vectorBucketName": self.vector_bucket_name, + "indexName": self.index_name, + "topK": min(top_k, _MAX_TOP_K), # S3 Vectors caps at 100 + "queryVector": {"float32": query_embedding}, + "returnMetadata": True, + "returnDistance": True, + } + if s3_filter: + query_kwargs["filter"] = s3_filter + + result = client.query_vectors(**query_kwargs) + + # Convert distance to a score. + # For cosine: S3 Vectors returns cosine *distance* (0 = identical, 2 = opposite). + # Haystack convention is higher score = more similar, so we convert: score = 1 - distance. + # For euclidean: we negate the distance so that closer vectors score higher. 
+ distance_metric = result.get("distanceMetric", self.distance_metric) + + documents = [] + for v in result.get("vectors", []): + doc = self._s3_vector_to_document(v) + + # Compute score from distance + score = None + raw_distance = v.get("distance") + if raw_distance is not None: + if distance_metric == "cosine": + score = 1.0 - raw_distance + else: + # euclidean: negate so higher = more similar + score = -raw_distance + + # query_vectors does not return vector data; attach score + documents.append(replace(doc, embedding=None, score=score)) + + return documents + + @staticmethod + def _document_to_s3_vector(doc: Document) -> dict[str, Any]: + """Convert a Haystack Document to an S3 Vectors vector entry.""" + metadata: dict[str, Any] = {} + + # Store content as non-filterable metadata + if doc.content is not None: + metadata[_CONTENT_KEY] = doc.content + + # Store blob fields + if doc.blob is not None: + metadata[_BLOB_DATA_KEY] = base64.b64encode(doc.blob.data).decode("ascii") + if doc.blob.meta: + metadata[_BLOB_META_KEY] = doc.blob.meta + if doc.blob.mime_type: + metadata[_BLOB_MIME_TYPE_KEY] = doc.blob.mime_type + + # Store user metadata + if doc.meta: + for key, value in doc.meta.items(): + if key in _RESERVED_META_KEYS: + logger.warning( + "Metadata key '{key}' is reserved; user value will be ignored.", + key=key, + ) + continue + metadata[key] = value + + # Warn if metadata is likely too large + try: + meta_size = len(json.dumps(metadata).encode("utf-8")) + if meta_size > _MAX_TOTAL_METADATA_BYTES: + logger.warning( + "Document '{doc_id}' has ~{size} bytes of metadata, exceeding the S3 Vectors " + "limit of {limit} bytes. 
The put_vectors call may fail.", + doc_id=doc.id, + size=meta_size, + limit=_MAX_TOTAL_METADATA_BYTES, + ) + except (TypeError, ValueError): + pass # Best-effort size check + + return { + "key": doc.id, + "data": {"float32": doc.embedding}, + "metadata": metadata, + } + + @staticmethod + def _s3_vector_to_document(vector: dict[str, Any]) -> Document: + """Convert an S3 Vectors vector response to a Haystack Document.""" + metadata = dict(vector.get("metadata", {})) + content = metadata.pop(_CONTENT_KEY, None) + blob_data = metadata.pop(_BLOB_DATA_KEY, None) + blob_meta = metadata.pop(_BLOB_META_KEY, None) + blob_mime_type = metadata.pop(_BLOB_MIME_TYPE_KEY, None) + + blob = None + if blob_data is not None: + blob = ByteStream( + data=base64.b64decode(blob_data) if isinstance(blob_data, str) else blob_data, + meta=blob_meta or {}, + mime_type=blob_mime_type, + ) + + embedding = None + data = vector.get("data", {}) + if "float32" in data: + embedding = data["float32"] + + return Document( + id=vector["key"], + content=content, + meta=metadata, + embedding=embedding, + blob=blob, + ) diff --git a/integrations/amazon_s3_vectors/src/haystack_integrations/document_stores/amazon_s3_vectors/filters.py b/integrations/amazon_s3_vectors/src/haystack_integrations/document_stores/amazon_s3_vectors/filters.py new file mode 100644 index 0000000000..3141868f12 --- /dev/null +++ b/integrations/amazon_s3_vectors/src/haystack_integrations/document_stores/amazon_s3_vectors/filters.py @@ -0,0 +1,144 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any + +from haystack.errors import FilterError + + +def _normalize_filters(filters: dict[str, Any]) -> dict[str, Any]: + """ + Convert Haystack filters to Amazon S3 Vectors compatible filters. 
+ + Reference: https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-vectors-metadata-filtering.html + """ + if not isinstance(filters, dict): + msg = "Filters must be a dictionary" + raise FilterError(msg) + + if "field" in filters: + return _parse_comparison_condition(filters) + return _parse_logical_condition(filters) + + +def _parse_logical_condition(condition: dict[str, Any]) -> dict[str, Any]: + if "operator" not in condition: + msg = f"'operator' key missing in {condition}" + raise FilterError(msg) + if "conditions" not in condition: + msg = f"'conditions' key missing in {condition}" + raise FilterError(msg) + + operator = condition["operator"] + conditions = [_normalize_filters(c) for c in condition["conditions"]] + + if operator in LOGICAL_OPERATORS: + return {LOGICAL_OPERATORS[operator]: conditions} + + msg = f"Unknown logical operator '{operator}'" + raise FilterError(msg) + + +def _parse_comparison_condition(condition: dict[str, Any]) -> dict[str, Any]: + if "field" not in condition: + return _parse_logical_condition(condition) + + field: str = condition["field"] + if "operator" not in condition: + msg = f"'operator' key missing in {condition}" + raise FilterError(msg) + if "value" not in condition: + msg = f"'value' key missing in {condition}" + raise FilterError(msg) + + operator: str = condition["operator"] + value: Any = condition["value"] + + # Strip the "meta." 
prefix — metadata is stored flat in S3 Vectors + if field.startswith("meta."): + field = field[5:] + + if operator not in COMPARISON_OPERATORS: + msg = f"Unknown comparison operator '{operator}'" + raise FilterError(msg) + + return COMPARISON_OPERATORS[operator](field, value) + + +def _equal(field: str, value: Any) -> dict[str, Any]: + _assert_supported_type(value, (str, int, float, bool), "equal") + return {field: {"$eq": value}} + + +def _not_equal(field: str, value: Any) -> dict[str, Any]: + _assert_supported_type(value, (str, int, float, bool), "not equal") + return {field: {"$ne": value}} + + +def _greater_than(field: str, value: Any) -> dict[str, Any]: + _assert_supported_type(value, (int, float), "greater than") + return {field: {"$gt": value}} + + +def _greater_than_equal(field: str, value: Any) -> dict[str, Any]: + _assert_supported_type(value, (int, float), "greater than equal") + return {field: {"$gte": value}} + + +def _less_than(field: str, value: Any) -> dict[str, Any]: + _assert_supported_type(value, (int, float), "less than") + return {field: {"$lt": value}} + + +def _less_than_equal(field: str, value: Any) -> dict[str, Any]: + _assert_supported_type(value, (int, float), "less than equal") + return {field: {"$lte": value}} + + +def _in(field: str, value: Any) -> dict[str, Any]: + if not isinstance(value, list): + msg = f"{field}'s value must be a list when using 'in' comparator" + raise FilterError(msg) + for v in value: + _assert_supported_type(v, (str, int, float), "in") + return {field: {"$in": value}} + + +def _not_in(field: str, value: Any) -> dict[str, Any]: + if not isinstance(value, list): + msg = f"{field}'s value must be a list when using 'not in' comparator" + raise FilterError(msg) + for v in value: + _assert_supported_type(v, (str, int, float), "not in") + return {field: {"$nin": value}} + + +def _assert_supported_type(value: Any, supported_types: tuple[type, ...], operator_name: str) -> None: + if not isinstance(value, 
supported_types): + msg = ( + f"Unsupported type for '{operator_name}' comparison: {type(value)}. " + f"Types supported by S3 Vectors are: {supported_types}" + ) + raise FilterError(msg) + + +COMPARISON_OPERATORS = { + "==": _equal, + "!=": _not_equal, + ">": _greater_than, + ">=": _greater_than_equal, + "<": _less_than, + "<=": _less_than_equal, + "in": _in, + "not in": _not_in, +} + +LOGICAL_OPERATORS = {"AND": "$and", "OR": "$or"} + + +def _validate_filters(filters: dict[str, Any] | None) -> None: + """Validate Haystack filter syntax.""" + if filters and "operator" not in filters and "conditions" not in filters: + msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details." + raise ValueError(msg) diff --git a/integrations/amazon_s3_vectors/src/haystack_integrations/document_stores/py.typed b/integrations/amazon_s3_vectors/src/haystack_integrations/document_stores/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/amazon_s3_vectors/tests/__init__.py b/integrations/amazon_s3_vectors/tests/__init__.py new file mode 100644 index 0000000000..c1764a6e03 --- /dev/null +++ b/integrations/amazon_s3_vectors/tests/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/amazon_s3_vectors/tests/test_document_store.py b/integrations/amazon_s3_vectors/tests/test_document_store.py new file mode 100644 index 0000000000..8df69ffc49 --- /dev/null +++ b/integrations/amazon_s3_vectors/tests/test_document_store.py @@ -0,0 +1,241 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from unittest.mock import MagicMock, patch + +import pytest +from haystack.dataclasses import Document +from haystack.document_stores.errors import DocumentStoreError +from haystack.document_stores.types import DuplicatePolicy + +from haystack_integrations.document_stores.amazon_s3_vectors 
def _stub_client(mock_boto3):
    """Build a MagicMock s3vectors client and register it on the patched boto3 module."""
    client = MagicMock()
    client.get_vector_bucket.return_value = {}
    client.get_index.return_value = {}
    mock_boto3.client.return_value = client
    return client


def _make_store(**overrides):
    """Store factory with the small default config shared across these tests."""
    params = {"vector_bucket_name": "b", "index_name": "i", "dimension": 4, "region_name": "us-east-1"}
    params.update(overrides)
    return S3VectorsDocumentStore(**params)


def test_init_is_lazy():
    """No boto3 client is created at construction time."""
    store = S3VectorsDocumentStore(vector_bucket_name="my-bucket", index_name="my-index", dimension=768)
    assert store._client is None


def test_init_default_params():
    """Defaults: cosine metric, auto-create resources, no non-filterable keys."""
    store = S3VectorsDocumentStore(vector_bucket_name="my-bucket", index_name="my-index", dimension=768)
    assert (store.vector_bucket_name, store.index_name, store.dimension) == ("my-bucket", "my-index", 768)
    assert store.distance_metric == "cosine"
    assert store.create_bucket_and_index is True
    assert store.non_filterable_metadata_keys == []


@patch("haystack_integrations.document_stores.amazon_s3_vectors.document_store.boto3")
def test_to_dict(_mock_boto3):
    store = S3VectorsDocumentStore(
        vector_bucket_name="my-bucket",
        index_name="my-index",
        dimension=768,
        distance_metric="euclidean",
        region_name="us-west-2",
        create_bucket_and_index=False,
    )
    expected = {
        "type": "haystack_integrations.document_stores.amazon_s3_vectors.document_store.S3VectorsDocumentStore",
        "init_parameters": {
            "vector_bucket_name": "my-bucket",
            "index_name": "my-index",
            "dimension": 768,
            "distance_metric": "euclidean",
            "region_name": "us-west-2",
            "aws_access_key_id": None,
            "aws_secret_access_key": None,
            "aws_session_token": None,
            "create_bucket_and_index": False,
            "non_filterable_metadata_keys": [],
        },
    }
    assert store.to_dict() == expected


@patch("haystack_integrations.document_stores.amazon_s3_vectors.document_store.boto3")
def test_from_dict(_mock_boto3):
    data = {
        "type": "haystack_integrations.document_stores.amazon_s3_vectors.document_store.S3VectorsDocumentStore",
        "init_parameters": {
            "vector_bucket_name": "my-bucket",
            "index_name": "my-index",
            "dimension": 768,
            "distance_metric": "euclidean",
            "region_name": "us-west-2",
            "aws_access_key_id": None,
            "aws_secret_access_key": None,
            "aws_session_token": None,
            "create_bucket_and_index": False,
            "non_filterable_metadata_keys": [],
        },
    }
    restored = S3VectorsDocumentStore.from_dict(data)
    assert (restored.vector_bucket_name, restored.index_name) == ("my-bucket", "my-index")
    assert restored.dimension == 768
    assert restored.distance_metric == "euclidean"
    assert restored.region_name == "us-west-2"
    assert restored.create_bucket_and_index is False


@patch("haystack_integrations.document_stores.amazon_s3_vectors.document_store.boto3")
def test_write_documents_no_embedding_raises(mock_boto3):
    """S3 Vectors requires embeddings — this tests our validation, not the store."""
    _stub_client(mock_boto3)
    store = _make_store()
    with pytest.raises(DocumentStoreError, match="has no embedding"):
        store.write_documents([Document(id="1", content="Hello")])


@patch("haystack_integrations.document_stores.amazon_s3_vectors.document_store.boto3")
def test_write_documents_skip_existing(mock_boto3):
    """Tests our batch existence check logic for SKIP policy."""
    client = _stub_client(mock_boto3)
    client.get_vectors.return_value = {"vectors": [{"key": "1"}]}
    store = _make_store()
    written = store.write_documents(
        [Document(id="1", content="Hello", embedding=[0.1] * 4)],
        policy=DuplicatePolicy.SKIP,
    )
    assert written == 0
    client.put_vectors.assert_not_called()


@patch("haystack_integrations.document_stores.amazon_s3_vectors.document_store.boto3")
def test_write_documents_none_policy_raises(mock_boto3):
    """Tests our batch existence check logic for NONE policy."""
    client = _stub_client(mock_boto3)
    client.get_vectors.return_value = {"vectors": [{"key": "1"}]}
    store = _make_store()
    with pytest.raises(DocumentStoreError, match="already exist"):
        store.write_documents(
            [Document(id="1", content="Hello", embedding=[0.1] * 4)],
            policy=DuplicatePolicy.NONE,
        )


@patch("haystack_integrations.document_stores.amazon_s3_vectors.document_store.boto3")
def test_embedding_retrieval_score_conversion(mock_boto3):
    """Tests our distance-to-score conversion logic — the only non-trivial transform in retrieval."""
    client = _stub_client(mock_boto3)
    client.query_vectors.return_value = {
        "vectors": [{"key": "1", "distance": 0.05, "metadata": {"_content": "Hello", "category": "news"}}],
        "distanceMetric": "cosine",
    }
    store = _make_store()
    docs = store._embedding_retrieval(query_embedding=[0.1] * 4, top_k=5)
    assert [d.id for d in docs] == ["1"]
    assert docs[0].content == "Hello"
    assert docs[0].score == pytest.approx(0.95)  # cosine: 1.0 - 0.05
    assert docs[0].meta == {"category": "news"}


@patch("haystack_integrations.document_stores.amazon_s3_vectors.document_store.boto3")
def test_embedding_retrieval_euclidean_score(mock_boto3):
    """Tests euclidean distance-to-score conversion (negated)."""
    client = _stub_client(mock_boto3)
    client.query_vectors.return_value = {
        "vectors": [{"key": "1", "distance": 1.5, "metadata": {}}],
        "distanceMetric": "euclidean",
    }
    store = _make_store(distance_metric="euclidean")
    docs = store._embedding_retrieval(query_embedding=[0.1] * 4, top_k=5)
    assert docs[0].score == pytest.approx(-1.5)  # euclidean: negated


@patch("haystack_integrations.document_stores.amazon_s3_vectors.document_store.boto3")
def test_embedding_retrieval_passes_filters(mock_boto3):
    """Tests that Haystack filters are converted and passed to query_vectors."""
    client = _stub_client(mock_boto3)
    client.query_vectors.return_value = {"vectors": [], "distanceMetric": "cosine"}
    store = _make_store()
    haystack_filters = {
        "operator": "AND",
        "conditions": [{"field": "meta.category", "operator": "==", "value": "news"}],
    }
    store._embedding_retrieval(query_embedding=[0.1] * 4, filters=haystack_filters, top_k=5)

    passed = client.query_vectors.call_args[1]
    assert passed["filter"] == {"$and": [{"category": {"$eq": "news"}}]}


def test_embedding_retrieval_empty_embedding_raises():
    """Tests our input validation — no mocking needed."""
    store = S3VectorsDocumentStore(vector_bucket_name="b", index_name="i", dimension=4, create_bucket_and_index=False)
    with pytest.raises(ValueError, match="non-empty"):
        store._embedding_retrieval(query_embedding=[])


def test_document_to_s3_vector():
    """Tests our Document → S3 vector conversion (pure function)."""
    doc = Document(
        id="test-1", content="Hello world", embedding=[0.1, 0.2, 0.3], meta={"category": "test", "year": 2024}
    )
    vector = S3VectorsDocumentStore._document_to_s3_vector(doc)
    assert vector["key"] == "test-1"
    assert vector["data"] == {"float32": [0.1, 0.2, 0.3]}
    assert vector["metadata"]["_content"] == "Hello world"
    assert vector["metadata"]["category"] == "test"
    assert vector["metadata"]["year"] == 2024


def test_s3_vector_to_document():
    """Tests our S3 vector → Document conversion (pure function)."""
    vector = {
        "key": "test-1",
        "data": {"float32": [0.1, 0.2, 0.3]},
        "metadata": {"_content": "Hello world", "category": "test"},
    }
    restored = S3VectorsDocumentStore._s3_vector_to_document(vector)
    assert restored.id == "test-1"
    assert restored.content == "Hello world"
    assert restored.embedding == [0.1, 0.2, 0.3]
    assert restored.meta == {"category": "test"}


def test_document_roundtrip():
    """Tests Document → S3 vector → Document is lossless."""
    original = Document(
        id="test-1", content="Hello world", embedding=[0.1, 0.2, 0.3], meta={"category": "test", "year": 2024}
    )
    restored = S3VectorsDocumentStore._s3_vector_to_document(S3VectorsDocumentStore._document_to_s3_vector(original))
    assert restored.id == original.id
    assert restored.content == original.content
    assert restored.embedding == original.embedding
    assert restored.meta == original.meta
def test_init_default():
    """Defaults: empty filters, top_k 10, REPLACE filter policy; bad policy strings raise."""
    mock_store = Mock(spec=S3VectorsDocumentStore)
    retriever = S3VectorsEmbeddingRetriever(document_store=mock_store)
    assert retriever.document_store == mock_store
    assert retriever.filters == {}
    assert retriever.top_k == 10
    assert retriever.filter_policy == FilterPolicy.REPLACE

    # A string policy is coerced to the enum
    retriever = S3VectorsEmbeddingRetriever(document_store=mock_store, filter_policy="replace")
    assert retriever.filter_policy == FilterPolicy.REPLACE

    with pytest.raises(ValueError):
        S3VectorsEmbeddingRetriever(document_store=mock_store, filter_policy="invalid")


@patch("haystack_integrations.document_stores.amazon_s3_vectors.document_store.boto3")
def test_to_dict(_mock_boto3):
    store = S3VectorsDocumentStore(
        vector_bucket_name="test-bucket",
        index_name="test-index",
        dimension=768,
        region_name="us-east-1",
        create_bucket_and_index=False,
    )
    retriever = S3VectorsEmbeddingRetriever(document_store=store, top_k=5)
    expected = {
        "type": "haystack_integrations.components.retrievers.amazon_s3_vectors.embedding_retriever.S3VectorsEmbeddingRetriever",
        "init_parameters": {
            "document_store": {
                "type": "haystack_integrations.document_stores.amazon_s3_vectors.document_store.S3VectorsDocumentStore",
                "init_parameters": {
                    "vector_bucket_name": "test-bucket",
                    "index_name": "test-index",
                    "dimension": 768,
                    "distance_metric": "cosine",
                    "region_name": "us-east-1",
                    "aws_access_key_id": None,
                    "aws_secret_access_key": None,
                    "aws_session_token": None,
                    "create_bucket_and_index": False,
                    "non_filterable_metadata_keys": [],
                },
            },
            "filters": {},
            "top_k": 5,
            "filter_policy": "replace",
        },
    }
    assert retriever.to_dict() == expected


@patch("haystack_integrations.document_stores.amazon_s3_vectors.document_store.boto3")
def test_from_dict(_mock_boto3):
    data = {
        "type": "haystack_integrations.components.retrievers.amazon_s3_vectors.embedding_retriever.S3VectorsEmbeddingRetriever",
        "init_parameters": {
            "document_store": {
                "type": "haystack_integrations.document_stores.amazon_s3_vectors.document_store.S3VectorsDocumentStore",
                "init_parameters": {
                    "vector_bucket_name": "test-bucket",
                    "index_name": "test-index",
                    "dimension": 768,
                    "distance_metric": "cosine",
                    "region_name": "us-east-1",
                    "aws_access_key_id": None,
                    "aws_secret_access_key": None,
                    "aws_session_token": None,
                    "create_bucket_and_index": False,
                    "non_filterable_metadata_keys": [],
                },
            },
            "filters": {},
            "top_k": 5,
            "filter_policy": "replace",
        },
    }
    restored = S3VectorsEmbeddingRetriever.from_dict(data)
    assert restored.top_k == 5
    assert restored.filter_policy == FilterPolicy.REPLACE
    assert restored.document_store.vector_bucket_name == "test-bucket"
    assert restored.document_store.dimension == 768


@patch("haystack_integrations.document_stores.amazon_s3_vectors.document_store.boto3")
def test_from_dict_no_filter_policy(_mock_boto3):
    """Pipelines serialized with older versions may not have filter_policy."""
    data = {
        "type": "haystack_integrations.components.retrievers.amazon_s3_vectors.embedding_retriever.S3VectorsEmbeddingRetriever",
        "init_parameters": {
            "document_store": {
                "type": "haystack_integrations.document_stores.amazon_s3_vectors.document_store.S3VectorsDocumentStore",
                "init_parameters": {
                    "vector_bucket_name": "test-bucket",
                    "index_name": "test-index",
                    "dimension": 768,
                    "create_bucket_and_index": False,
                },
            },
            "filters": {},
            "top_k": 10,
        },
    }
    restored = S3VectorsEmbeddingRetriever.from_dict(data)
    assert restored.filter_policy == FilterPolicy.REPLACE  # default


def test_run():
    """run() delegates to the store's _embedding_retrieval with its configured defaults."""
    mock_store = Mock(spec=S3VectorsDocumentStore)
    mock_store._embedding_retrieval.return_value = [Document(content="Test doc", embedding=[0.1, 0.2])]
    retriever = S3VectorsEmbeddingRetriever(document_store=mock_store)

    result = retriever.run(query_embedding=[0.5, 0.7])

    mock_store._embedding_retrieval.assert_called_once_with(
        query_embedding=[0.5, 0.7],
        filters={},
        top_k=10,
    )
    assert len(result["documents"]) == 1
    assert result["documents"][0].content == "Test doc"
# SPDX-FileCopyrightText: 2022-present deepset GmbH
#
# SPDX-License-Identifier: Apache-2.0

"""
Unit tests for the Haystack → S3 Vectors filter conversion.

This is a pure function with real logic (operator mapping, type validation,
meta. prefix stripping, logical nesting) so unit testing is high signal.
"""

import pytest
from haystack.errors import FilterError

from haystack_integrations.document_stores.amazon_s3_vectors.filters import _normalize_filters, _validate_filters


def test_comparison_operators():
    """All comparison operators produce the correct S3 Vectors filter."""
    expectations = {
        "==": ("a", {"x": {"$eq": "a"}}),
        "!=": ("a", {"x": {"$ne": "a"}}),
        ">": (1, {"x": {"$gt": 1}}),
        ">=": (1, {"x": {"$gte": 1}}),
        "<": (1.5, {"x": {"$lt": 1.5}}),
        "<=": (1.5, {"x": {"$lte": 1.5}}),
        "in": ([1, 2], {"x": {"$in": [1, 2]}}),
        "not in": (["a"], {"x": {"$nin": ["a"]}}),
    }
    for operator, (value, expected) in expectations.items():
        assert _normalize_filters({"field": "meta.x", "operator": operator, "value": value}) == expected


def test_logical_operators():
    """AND/OR produce $and/$or with nested conditions."""
    and_filter = {
        "operator": "AND",
        "conditions": [
            {"field": "meta.a", "operator": "==", "value": 1},
            {"field": "meta.b", "operator": ">", "value": 2},
        ],
    }
    assert _normalize_filters(and_filter) == {"$and": [{"a": {"$eq": 1}}, {"b": {"$gt": 2}}]}

    or_filter = {
        "operator": "OR",
        "conditions": [
            {"field": "meta.a", "operator": "==", "value": "x"},
            {"field": "meta.a", "operator": "==", "value": "y"},
        ],
    }
    assert _normalize_filters(or_filter) == {"$or": [{"a": {"$eq": "x"}}, {"a": {"$eq": "y"}}]}


def test_meta_prefix_stripped():
    """The meta. prefix is stripped since S3 Vectors stores metadata flat."""
    expected = {"category": {"$eq": "news"}}
    assert _normalize_filters({"field": "meta.category", "operator": "==", "value": "news"}) == expected
    # Fields without meta. prefix also work
    assert _normalize_filters({"field": "category", "operator": "==", "value": "news"}) == expected


def test_unsupported_type_raises():
    """Lists are not a valid value for scalar comparisons."""
    with pytest.raises(FilterError):
        _normalize_filters({"field": "meta.x", "operator": "==", "value": [1, 2, 3]})


def test_in_requires_list():
    """'in' rejects non-list values."""
    with pytest.raises(FilterError):
        _normalize_filters({"field": "meta.x", "operator": "in", "value": "not-a-list"})


def test_missing_keys_raise():
    """Structural problems (missing keys, non-dict input) raise FilterError."""
    with pytest.raises(FilterError):
        _normalize_filters({"field": "meta.x", "value": "a"})  # missing operator
    with pytest.raises(FilterError):
        _normalize_filters({"field": "meta.x", "operator": "=="})  # missing value
    with pytest.raises(FilterError):
        _normalize_filters("not a dict")


def test_validate_filters():
    """Syntax pre-check: None and structured filters pass, bare field dicts fail."""
    _validate_filters(None)  # None is valid
    _validate_filters({"operator": "AND", "conditions": []})  # valid structure
    with pytest.raises(ValueError, match="Invalid filter syntax"):
        _validate_filters({"field": "meta.x"})  # missing operator/conditions
import os
import time
import uuid

import pytest
from haystack.dataclasses import Document
from haystack.document_stores.errors import DocumentStoreError
from haystack.document_stores.types import DuplicatePolicy

from haystack_integrations.components.retrievers.amazon_s3_vectors import S3VectorsEmbeddingRetriever
from haystack_integrations.document_stores.amazon_s3_vectors import S3VectorsDocumentStore


def _aws_credentials_available() -> bool:
    """Check if AWS credentials are available via any mechanism."""
    # Explicit env vars
    for env_var in ("AWS_ACCESS_KEY_ID", "AWS_PROFILE", "AWS_ROLE_ARN"):
        if os.environ.get(env_var):
            return True
    # Fallback: try to resolve credentials from the default chain
    try:
        import boto3

        return boto3.Session().get_credentials() is not None
    except Exception:
        return False


# Guard: skip all integration tests when AWS credentials are not available
pytestmark = pytest.mark.skipif(
    not _aws_credentials_available(),
    reason="AWS credentials not configured",
)

# Use a small dimension to keep payloads light
DIMENSION = 4
REGION = "us-east-1"


def _random_name(prefix: str = "haystack-test") -> str:
    """Unique, recognizable name for throwaway buckets/indexes."""
    return f"{prefix}-{uuid.uuid4().hex[:8]}"


def _make_doc(doc_id: str, content: str, embedding: list[float] | None = None, meta: dict | None = None) -> Document:
    """Document factory with a default embedding of the right dimension."""
    return Document(
        id=doc_id,
        content=content,
        embedding=embedding or [0.1] * DIMENSION,
        meta=meta or {},
    )


@pytest.fixture(scope="module")
def doc_store():
    """Create a real S3 Vectors document store with a unique bucket + index, tear down after."""
    bucket_name = _random_name("hs-integ")
    index_name = _random_name("idx")

    store = S3VectorsDocumentStore(
        vector_bucket_name=bucket_name,
        index_name=index_name,
        dimension=DIMENSION,
        distance_metric="cosine",
        region_name=REGION,
        create_bucket_and_index=True,
        non_filterable_metadata_keys=[],
    )
    store._get_client()  # Force initialization

    yield store

    # Cleanup is best-effort: remove the index first, then the bucket
    client = store._client
    try:
        client.delete_index(vectorBucketName=bucket_name, indexName=index_name)
    except Exception:
        pass
    try:
        client.delete_vector_bucket(vectorBucketName=bucket_name)
    except Exception:
        pass


@pytest.mark.integration
class TestWriteAndCount:
    def test_write_and_count(self, doc_store):
        batch = [
            _make_doc("int-1", "First document"),
            _make_doc("int-2", "Second document"),
            _make_doc("int-3", "Third document"),
        ]
        assert doc_store.write_documents(batch, policy=DuplicatePolicy.OVERWRITE) == 3

        # S3 Vectors is eventually consistent — give it a moment
        time.sleep(2)

        assert doc_store.count_documents() == 3

    def test_overwrite(self, doc_store):
        updated = _make_doc("int-1", "Updated first document", embedding=[0.9] * DIMENSION)
        assert doc_store.write_documents([updated], policy=DuplicatePolicy.OVERWRITE) == 1

    def test_skip_existing(self, doc_store):
        duplicate = _make_doc("int-1", "Should be skipped")
        assert doc_store.write_documents([duplicate], policy=DuplicatePolicy.SKIP) == 0

    def test_none_policy_raises_on_existing(self, doc_store):
        duplicate = _make_doc("int-1", "Should fail")
        with pytest.raises(DocumentStoreError, match="already exist"):
            doc_store.write_documents([duplicate], policy=DuplicatePolicy.NONE)

    def test_write_without_embedding_raises(self, doc_store):
        with pytest.raises(DocumentStoreError, match="has no embedding"):
            doc_store.write_documents([Document(id="no-emb", content="No embedding")])


@pytest.mark.integration
class TestQuery:
    def test_embedding_retrieval(self, doc_store):
        # Query with an embedding close to [0.1, 0.1, 0.1, 0.1]
        results = doc_store._embedding_retrieval(query_embedding=[0.1] * DIMENSION, top_k=10)
        assert results
        # All returned docs should have a score (cosine similarity)
        for doc in results:
            assert doc.score is not None
            assert doc.content is not None

    def test_embedding_retrieval_with_metadata_filter(self, doc_store):
        # Write a doc with distinctive metadata
        tagged = _make_doc("int-tagged", "Tagged document", embedding=[0.5] * DIMENSION, meta={"category": "special"})
        doc_store.write_documents([tagged], policy=DuplicatePolicy.OVERWRITE)
        time.sleep(2)

        # Query with filter
        results = doc_store._embedding_retrieval(
            query_embedding=[0.5] * DIMENSION,
            filters={"field": "meta.category", "operator": "==", "value": "special"},
            top_k=10,
        )
        assert len(results) >= 1
        assert all(doc.meta.get("category") == "special" for doc in results)

    def test_retriever_component(self, doc_store):
        retriever = S3VectorsEmbeddingRetriever(document_store=doc_store, top_k=5)
        result = retriever.run(query_embedding=[0.1] * DIMENSION)
        assert "documents" in result
        assert result["documents"]


@pytest.mark.integration
class TestFilterDocuments:
    def test_filter_documents_no_filter(self, doc_store):
        results = doc_store.filter_documents()
        assert results
        assert all(doc.id is not None for doc in results)

    def test_filter_documents_with_filter(self, doc_store):
        results = doc_store.filter_documents(
            filters={"field": "meta.category", "operator": "==", "value": "special"}
        )
        assert all(doc.meta.get("category") == "special" for doc in results)


@pytest.mark.integration
class TestDelete:
    def test_delete_documents(self, doc_store):
        # Write a doc to delete
        doc_store.write_documents([_make_doc("int-to-delete", "Delete me")], policy=DuplicatePolicy.OVERWRITE)
        time.sleep(2)

        doc_store.delete_documents(["int-to-delete"])
        time.sleep(2)

        # Verify it's gone by trying to get it
        response = doc_store._client.get_vectors(
            vectorBucketName=doc_store.vector_bucket_name,
            indexName=doc_store.index_name,
            keys=["int-to-delete"],
        )
        assert len(response.get("vectors", [])) == 0


@pytest.mark.integration
class TestSerialization:
    def test_roundtrip(self, doc_store):
        restored = S3VectorsDocumentStore.from_dict(doc_store.to_dict())
        assert restored.vector_bucket_name == doc_store.vector_bucket_name
        assert restored.index_name == doc_store.index_name
        assert restored.dimension == doc_store.dimension
        assert restored.distance_metric == doc_store.distance_metric