diff --git a/.github/labeler.yml b/.github/labeler.yml index 30181024e7..20cb6edf9f 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -19,6 +19,11 @@ integration:anthropic: - any-glob-to-any-file: "integrations/anthropic/**/*" - any-glob-to-any-file: ".github/workflows/anthropic.yml" +integration:arcadedb: + - changed-files: + - any-glob-to-any-file: "integrations/arcadedb/**/*" + - any-glob-to-any-file: ".github/workflows/arcadedb.yml" + integration:astra: - changed-files: - any-glob-to-any-file: "integrations/astra/**/*" diff --git a/.github/workflows/arcadedb.yml b/.github/workflows/arcadedb.yml new file mode 100644 index 0000000000..58c3cb0f31 --- /dev/null +++ b/.github/workflows/arcadedb.yml @@ -0,0 +1,84 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / arcadedb + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/arcadedb/**" + - "!integrations/arcadedb/*.md" + - ".github/workflows/arcadedb.yml" + +concurrency: + group: arcadedb-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + ARCADEDB_USERNAME: "root" + # Only set in main repo (secrets not passed to fork workflows); integration tests skip when unset + ARCADEDB_PASSWORD: ${{ secrets.ARCADEDB_PASSWORD }} + +defaults: + run: + working-directory: integrations/arcadedb + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.10", "3.13"] + services: + arcadedb: + image: arcadedata/arcadedb:latest + env: + # Default password so container starts in forks; main repo uses secret + JAVA_OPTS: "-Darcadedb.server.rootPassword=${{ secrets.ARCADEDB_PASSWORD 
|| 'arcadedb' }}" + ports: + - 2480:2480 + + steps: + - uses: actions/checkout@v6 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install hatch "virtualenv<21.0.0" + + - name: Lint + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: hatch run fmt-check && hatch run test:types + + - name: Run tests + run: hatch run test:cov-retry + + - name: Run unit tests with lowest direct dependencies + run: | + hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt + hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt + hatch run test:unit + + - name: Nightly - run unit tests with Haystack main branch + if: github.event_name == 'schedule' + run: | + hatch env prune + hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main + hatch run test:unit + + - name: Send event to Datadog for nightly failures + if: failure() && github.event_name == 'schedule' + uses: ./.github/actions/send_failure + with: + title: | + Core integrations nightly tests failure: ${{ github.workflow }} + api-key: ${{ secrets.CORE_DATADOG_API_KEY }} diff --git a/README.md b/README.md index 560e659cc6..267300f92c 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [amazon-bedrock-haystack](integrations/amazon_bedrock/) | Embedder, Generator, Ranker, Downloader | [![PyPI - Version](https://img.shields.io/pypi/v/amazon-bedrock-haystack.svg)](https://pypi.org/project/amazon-bedrock-haystack) | [![Test / amazon_bedrock](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_bedrock.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_bedrock.yml) | | 
[amazon-sagemaker-haystack](integrations/amazon_sagemaker/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/amazon-sagemaker-haystack.svg)](https://pypi.org/project/amazon-sagemaker-haystack) | [![Test / amazon_sagemaker](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_sagemaker.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_sagemaker.yml) | | [anthropic-haystack](integrations/anthropic/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/anthropic-haystack.svg)](https://pypi.org/project/anthropic-haystack) | [![Test / anthropic](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/anthropic.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/anthropic.yml) | +| [arcadedb-haystack](integrations/arcadedb/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/arcadedb-haystack.svg)](https://pypi.org/project/arcadedb-haystack) | [![Test / arcadedb](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/arcadedb.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/arcadedb.yml) | | [astra-haystack](integrations/astra/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/astra-haystack.svg)](https://pypi.org/project/astra-haystack) | [![Test / astra](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/astra.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/astra.yml) | | [azure-ai-search-haystack](integrations/azure_ai_search/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/azure-ai-search-haystack.svg)](https://pypi.org/project/azure-ai-search-haystack) | [![Test / 
azure-ai-search](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_ai_search.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_ai_search.yml) | | [azure-doc-intelligence-haystack](integrations/azure_doc_intelligence/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/azure-doc-intelligence-haystack.svg)](https://pypi.org/project/azure-doc-intelligence-haystack) | [![Test / azure_doc_intelligence](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_doc_intelligence.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_doc_intelligence.yml) | diff --git a/integrations/arcadedb/LICENSE.txt b/integrations/arcadedb/LICENSE.txt new file mode 100644 index 0000000000..1c8582b372 --- /dev/null +++ b/integrations/arcadedb/LICENSE.txt @@ -0,0 +1,190 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to the Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by the Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding any notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2023-present deepset GmbH + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. diff --git a/integrations/arcadedb/README.md b/integrations/arcadedb/README.md new file mode 100644 index 0000000000..eb0ac897ca --- /dev/null +++ b/integrations/arcadedb/README.md @@ -0,0 +1,15 @@ +# arcadedb-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/arcadedb-haystack.svg)](https://pypi.org/project/arcadedb-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/arcadedb-haystack.svg)](https://pypi.org/project/arcadedb-haystack) +[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE.txt) + +- [Integration page](https://haystack.deepset.ai/integrations/arcadedb) +- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/arcadedb/CHANGELOG.md) + +--- + +## Contributing + +Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). + diff --git a/integrations/arcadedb/examples/embedding_retrieval.py b/integrations/arcadedb/examples/embedding_retrieval.py new file mode 100644 index 0000000000..cf539f65b9 --- /dev/null +++ b/integrations/arcadedb/examples/embedding_retrieval.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: 2025-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Example: Embedding retrieval with ArcadeDB + Haystack. 
+ +Prerequisites: + docker run -d -p 2480:2480 \ + -e JAVA_OPTS="-Darcadedb.server.rootPassword=arcadedb" \ + arcadedata/arcadedb:latest + + pip install arcadedb-haystack + +Usage: + export ARCADEDB_USERNAME=root + export ARCADEDB_PASSWORD=arcadedb + python examples/embedding_retrieval.py +""" + +from haystack import Document, Pipeline +from haystack.document_stores.types import DuplicatePolicy + +from haystack_integrations.components.retrievers.arcadedb import ArcadeDBEmbeddingRetriever +from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore + +# --- 1. Create the document store --- +store = ArcadeDBDocumentStore( + url="http://localhost:2480", + database="haystack_example", + embedding_dimension=4, # small dim for demo + similarity_function="cosine", + recreate_type=True, +) + +# --- 2. Write some documents --- +documents = [ + Document( + content="ArcadeDB is a multi-model database supporting graphs, documents, key-value, time-series, and vectors.", + embedding=[1.0, 0.0, 0.0, 0.0], + meta={"category": "database", "source": "docs"}, + ), + Document( + content="Haystack is an open-source framework for building RAG pipelines.", + embedding=[0.0, 1.0, 0.0, 0.0], + meta={"category": "framework", "source": "docs"}, + ), + Document( + content="HNSW (Hierarchical Navigable Small World) enables fast approximate nearest neighbor search.", + embedding=[0.5, 0.5, 0.0, 0.0], + meta={"category": "algorithm", "source": "paper"}, + ), + Document( + content="Vector databases store high-dimensional embeddings for semantic search.", + embedding=[0.8, 0.2, 0.0, 0.0], + meta={"category": "database", "source": "blog"}, + ), +] + +written = store.write_documents(documents, policy=DuplicatePolicy.OVERWRITE) +print(f"Wrote {written} documents") +print(f"Total documents: {store.count_documents()}") + +# --- 3. 
Build a retrieval pipeline --- +pipeline = Pipeline() +pipeline.add_component("retriever", ArcadeDBEmbeddingRetriever(document_store=store, top_k=3)) + +# --- 4. Run a similarity search --- +query_embedding = [0.9, 0.1, 0.0, 0.0] # close to "ArcadeDB" and "Vector databases" +result = pipeline.run({"retriever": {"query_embedding": query_embedding}}) + +print("\n--- Top 3 results ---") +for doc in result["retriever"]["documents"]: + print(f" score={doc.score:.4f} category={doc.meta.get('category')} content={doc.content[:80]}...") + +# --- 5. Filter retrieval (only 'database' category) --- +result_filtered = pipeline.run( + { + "retriever": { + "query_embedding": query_embedding, + "filters": {"field": "meta.category", "operator": "==", "value": "database"}, + } + } +) + +print("\n--- Filtered (category=database) ---") +for doc in result_filtered["retriever"]["documents"]: + print(f" score={doc.score:.4f} content={doc.content[:80]}...") diff --git a/integrations/arcadedb/pydoc/config_docusaurus.yml b/integrations/arcadedb/pydoc/config_docusaurus.yml new file mode 100644 index 0000000000..e3cb005d73 --- /dev/null +++ b/integrations/arcadedb/pydoc/config_docusaurus.yml @@ -0,0 +1,14 @@ +loaders: + - modules: + - haystack_integrations.components.retrievers.arcadedb.embedding_retriever + - haystack_integrations.document_stores.arcadedb.document_store + search_path: [../src] +processors: + - type: filter + documented_only: true + skip_empty_modules: true +renderer: + description: ArcadeDB integration for Haystack + id: integrations-arcadedb + filename: arcadedb.md + title: ArcadeDB diff --git a/integrations/arcadedb/pyproject.toml b/integrations/arcadedb/pyproject.toml new file mode 100644 index 0000000000..d79b975003 --- /dev/null +++ b/integrations/arcadedb/pyproject.toml @@ -0,0 +1,119 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "arcadedb-haystack" +dynamic = ["version"] +description = "An integration 
of ArcadeDB with Haystack — document storage + HNSW vector search + SQL filtering" +readme = "README.md" +requires-python = ">=3.10" +license = "Apache-2.0" +keywords = ["arcadedb", "haystack", "vector-search", "document-store", "rag"] +authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }, { name = "ArcadeData Ltd", email = "info@arcadedb.com" }] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ + "haystack-ai>=2.9.0", + "requests>=2.28.0", +] + +[project.urls] +Source = "https://github.com/deepset-ai/haystack-core-integrations" +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/arcadedb/README.md" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/arcadedb-v(?P<version>.*)' + +[tool.hatch.version.raw-options] +root = "../.." 
+git_describe_command = 'git describe --tags --match="integrations/arcadedb-v[0-9]*"' + +[tool.hatch.envs.default] +installer = "uv" +dependencies = ["haystack-pydoc-tools", "ruff"] + +[tool.hatch.envs.default.scripts] +docs = ["haystack-pydoc pydoc/config_docusaurus.yml"] +fmt = "ruff check --fix {args}; ruff format {args}" +fmt-check = "ruff check {args} && ruff format --check {args}" + +[tool.hatch.envs.test] +dependencies = [ + "pytest", + "pytest-cov", + "pytest-rerunfailures", + "mypy", + "pip", +] + +[tool.hatch.envs.test.scripts] +unit = 'pytest -m "not integration" {args:tests}' +integration = 'pytest -m "integration" {args:tests}' +all = 'pytest {args:tests}' +cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x {args:tests}' +types = "mypy -p haystack_integrations.document_stores.arcadedb -p haystack_integrations.components.retrievers.arcadedb {args}" + +[tool.mypy] +install_types = true +non_interactive = true +check_untyped_defs = true +disallow_incomplete_defs = true + +[[tool.mypy.overrides]] +module = ["requests.*"] +ignore_missing_imports = true + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = [ + "A", "ARG", "B", "C", "DTZ", "E", "EM", "F", "FBT", "I", "ICN", + "ISC", "N", "PLC", "PLE", "PLR", "PLW", "Q", "RUF", "S", "T", + "TID", "UP", "W", "YTT", +] +ignore = [ + "B027", "FBT003", "S105", "S106", "S107", + "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", + "B008", "S101", + # SQL string construction is intentional — ArcadeDB uses HTTP/JSON API with value escaping + "S608", +] + +[tool.ruff.lint.isort] +known-first-party = ["haystack_integrations"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.lint.per-file-ignores] +"tests/**/*" = ["PLR2004", "S101", "TID252"] +"examples/**/*" = ["T201"] + +[tool.coverage.run] +source = ["haystack_integrations"] +branch = true +parallel = false + +[tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing = 
true +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + +[tool.pytest.ini_options] +markers = ["integration: integration tests"] diff --git a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/__init__.py b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/__init__.py new file mode 100644 index 0000000000..da774b7d53 --- /dev/null +++ b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2025-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from haystack_integrations.components.retrievers.arcadedb.embedding_retriever import ArcadeDBEmbeddingRetriever + +__all__ = ["ArcadeDBEmbeddingRetriever"] diff --git a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py new file mode 100644 index 0000000000..ba36a58e4c --- /dev/null +++ b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: 2025-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any + +from haystack import Document, component, default_from_dict, default_to_dict +from haystack.document_stores.types import FilterPolicy + +from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore + + +@component +class ArcadeDBEmbeddingRetriever: + """ + Retrieve documents from ArcadeDB using vector similarity (LSM_VECTOR / HNSW index). 
+ + Usage example: + + ```python + from haystack import Document + from haystack.components.embedders import SentenceTransformersTextEmbedder + from haystack_integrations.components.retrievers.arcadedb import ArcadeDBEmbeddingRetriever + from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore + + store = ArcadeDBDocumentStore(database="mydb") + retriever = ArcadeDBEmbeddingRetriever(document_store=store, top_k=5) + + # Add documents to DocumentStore + documents = [ + Document(content="My name is Carla and I live in Berlin"), + Document(content="My name is Paul and I live in New York"), + Document(content="My name is Silvano and I live in Matera"), + Document(content="My name is Usagi Tsukino and I live in Tokyo"), + ] + store.write_documents(documents) + + embedder = SentenceTransformersTextEmbedder() + query_embeddings = embedder.run("Who lives in Berlin?")["embedding"] + + result = retriever.run(query_embedding=query_embeddings) + for doc in result["documents"]: + print(doc.content) + ``` + """ + + def __init__( + self, + *, + document_store: ArcadeDBDocumentStore, + filters: dict[str, Any] | None = None, + top_k: int = 10, + filter_policy: FilterPolicy = FilterPolicy.REPLACE, + ): + """ + Create an ArcadeDBEmbeddingRetriever. + + :param document_store: An instance of ``ArcadeDBDocumentStore``. + :param filters: Default filters applied to every retrieval call. + :param top_k: Maximum number of documents to return. + :param filter_policy: How runtime filters interact with default filters. + """ + self._document_store = document_store + self._filters = filters + self._top_k = top_k + self._filter_policy = filter_policy + + @component.output_types(documents=list[Document]) + def run( + self, + query_embedding: list[float], + filters: dict[str, Any] | None = None, + top_k: int | None = None, + ) -> dict[str, list[Document]]: + """ + Retrieve documents by vector similarity. + + :param query_embedding: The embedding vector to search with. 
+ :param filters: Optional filters to narrow results. + :param top_k: Maximum number of documents to return. + :returns: A dictionary with the following keys: + - `documents`: List of `Document`s most similar to the given `query_embedding` + """ + effective_top_k = top_k if top_k is not None else self._top_k + + effective_filters: dict[str, Any] | None + if self._filter_policy == FilterPolicy.REPLACE and filters is not None: + effective_filters = filters + elif self._filter_policy == FilterPolicy.MERGE and filters is not None and self._filters is not None: + effective_filters = { + "operator": "AND", + "conditions": [self._filters, filters], + } + else: + effective_filters = filters or self._filters + + documents = self._document_store._embedding_retrieval( + query_embedding=query_embedding, + filters=effective_filters, + top_k=effective_top_k, + ) + return {"documents": documents} + + def to_dict(self) -> dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return default_to_dict( + self, + document_store=self._document_store.to_dict(), + filters=self._filters, + top_k=self._top_k, + filter_policy=self._filter_policy.value, + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "ArcadeDBEmbeddingRetriever": + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. 
+ """ + init_params = data.get("init_parameters", {}) + if "document_store" in init_params: + init_params["document_store"] = ArcadeDBDocumentStore.from_dict(init_params["document_store"]) + if "filter_policy" in init_params: + init_params["filter_policy"] = FilterPolicy(init_params["filter_policy"]) + return default_from_dict(cls, data) diff --git a/integrations/arcadedb/src/haystack_integrations/components/retrievers/py.typed b/integrations/arcadedb/src/haystack_integrations/components/retrievers/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/__init__.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/__init__.py new file mode 100644 index 0000000000..d72ad983c9 --- /dev/null +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2025-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from haystack_integrations.document_stores.arcadedb.document_store import ArcadeDBDocumentStore + +__all__ = ["ArcadeDBDocumentStore"] diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/converters.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/converters.py new file mode 100644 index 0000000000..61be17a587 --- /dev/null +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/converters.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: 2025-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +"""Convert between Haystack Documents and ArcadeDB records.""" + +from typing import Any + +from haystack import Document + + +def _from_haystack_to_arcadedb(documents: list[Document]) -> list[dict[str, Any]]: + """Convert Haystack Documents to dicts suitable for ArcadeDB INSERT.""" + records = [] + for doc in documents: + record: dict[str, Any] = { + "id": doc.id, + "content": 
class ArcadeDBDocumentStore:
    """
    An ArcadeDB-backed DocumentStore for Haystack 2.x.

    Uses ArcadeDB's HTTP/JSON API for all operations — no special drivers required.
    Supports HNSW vector search (LSM_VECTOR) and SQL metadata filtering.

    Usage example:

    ```python
    from haystack.dataclasses.document import Document
    from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore

    # `embedding_dimension` must match the length of the document embeddings:
    # embeddings of any other length are stored as zero vectors.
    document_store = ArcadeDBDocumentStore(
        url="http://localhost:2480",
        database="haystack",
        embedding_dimension=5,
    )
    document_store.write_documents([
        Document(content="This is first", embedding=[0.0]*5),
        Document(content="This is second", embedding=[0.1, 0.2, 0.3, 0.4, 0.5])
    ])
    ```
    """

    # Map user-facing similarity names to ArcadeDB LSM_VECTOR metric keywords
    _SIMILARITY_MAP: ClassVar[dict[str, str]] = {
        "cosine": "COSINE",
        "euclidean": "EUCLIDEAN",
        "dot": "DOT_PRODUCT",
    }

    def __init__(
        self,
        *,
        url: str = "http://localhost:2480",
        database: str = "haystack",
        username: Secret = Secret.from_env_var("ARCADEDB_USERNAME", strict=False),
        password: Secret = Secret.from_env_var("ARCADEDB_PASSWORD", strict=False),
        type_name: str = "Document",
        embedding_dimension: int = 768,
        similarity_function: str = "cosine",
        recreate_type: bool = False,
        create_database: bool = True,
    ):
        """
        Create an ArcadeDBDocumentStore instance.

        No request is sent here; the database, type and indexes are set up lazily
        on the first operation that needs them.

        :param url: ArcadeDB HTTP endpoint.
        :param database: Database name.
        :param username: HTTP Basic Auth username (default: ``ARCADEDB_USERNAME`` env var).
        :param password: HTTP Basic Auth password (default: ``ARCADEDB_PASSWORD`` env var).
        :param type_name: Vertex type name for documents.
        :param embedding_dimension: Vector dimension for the HNSW index.
        :param similarity_function: Distance metric — ``"cosine"``, ``"euclidean"``, or ``"dot"``.
            Any other value falls back to cosine (a warning is logged at initialization).
        :param recreate_type: If ``True``, drop and recreate the type on initialization.
        :param create_database: If ``True``, create the database if it doesn't exist.
        """
        self._url = url.rstrip("/")
        self._database = database
        self._username = username
        self._password = password
        self._type_name = type_name
        self._embedding_dimension = embedding_dimension
        self._similarity_function = similarity_function
        self._recreate_type = recreate_type
        self._create_database = create_database

        self._session = requests.Session()
        self._initialized = False

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes the DocumentStore to a dictionary.

        :returns:
            Dictionary with serialized data.
        """
        return default_to_dict(
            self,
            url=self._url,
            database=self._database,
            username=self._username.to_dict() if self._username else None,
            password=self._password.to_dict() if self._password else None,
            type_name=self._type_name,
            embedding_dimension=self._embedding_dimension,
            similarity_function=self._similarity_function,
            recreate_type=self._recreate_type,
            create_database=self._create_database,
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "ArcadeDBDocumentStore":
        """
        Deserializes the DocumentStore from a dictionary.

        :param data:
            The dictionary to deserialize from.
        :returns:
            The deserialized DocumentStore.
        """
        init_params = data.get("init_parameters", {})
        for key in ("username", "password"):
            if init_params.get(key) is not None:
                init_params[key] = Secret.from_dict(init_params[key])
        return default_from_dict(cls, data)

    # ------------------------------------------------------------------
    # HTTP helpers
    # ------------------------------------------------------------------

    def _auth(self) -> tuple[str, str] | None:
        """Return ``(username, password)`` for HTTP Basic Auth, or ``None`` unless both resolve to non-empty values."""
        user = self._username.resolve_value() if self._username else None
        pwd = self._password.resolve_value() if self._password else None
        if user and pwd:
            return (user, pwd)
        return None

    def _command(self, sql: str, *, positional_params: list[Any] | None = None) -> list[dict[str, Any]]:
        """
        Execute an SQL command via the ArcadeDB HTTP API and return result rows.

        :param sql: SQL statement to execute.
        :param positional_params: Optional parameters sent alongside the statement.
        :returns: The ``result`` list of the JSON response (empty if absent).
        :raises RuntimeError: If the server answers with an HTTP status >= 400.
        """
        url = f"{self._url}/api/v1/command/{self._database}"
        payload: dict[str, Any] = {"language": "sql", "command": sql}
        if positional_params:
            payload["params"] = positional_params

        resp = self._session.post(url, json=payload, auth=self._auth())
        if resp.status_code >= HTTPStatus.BAD_REQUEST:
            msg = f"ArcadeDB command failed ({resp.status_code}): {resp.text}"
            raise RuntimeError(msg)

        body = resp.json()
        return body.get("result", [])

    def _server_command(self, command: str) -> dict[str, Any]:
        """
        Execute a server-level command (e.g. CREATE DATABASE).

        :raises RuntimeError: If the server answers with an HTTP status >= 400.
        """
        url = f"{self._url}/api/v1/server"
        resp = self._session.post(url, json={"command": command}, auth=self._auth())
        if resp.status_code >= HTTPStatus.BAD_REQUEST:
            msg = f"ArcadeDB server command failed ({resp.status_code}): {resp.text}"
            raise RuntimeError(msg)
        return resp.json()

    def _ensure_initialized(self) -> None:
        """Create the database, vertex type, properties and indexes once; later calls are no-ops."""
        if self._initialized:
            return

        # 1. Optionally create the database
        if self._create_database:
            try:
                self._server_command(f"CREATE DATABASE {self._database}")
                logger.info("Created database '%s'", self._database)
            except RuntimeError:
                logger.debug("Database '%s' already exists or cannot be created", self._database)

        # 2. Optionally drop existing type
        if self._recreate_type:
            try:
                self._command(f"DROP TYPE `{self._type_name}` IF EXISTS UNSAFE")
            except RuntimeError as e:
                # Still best-effort, but leave a trace: a failed drop means old records survive.
                logger.warning("Could not drop type '%s': %s", self._type_name, e)

        # 3. Create vertex type + properties
        self._command(f"CREATE VERTEX TYPE `{self._type_name}` IF NOT EXISTS")
        self._command(f"CREATE PROPERTY `{self._type_name}`.id IF NOT EXISTS STRING")
        self._command(f"CREATE PROPERTY `{self._type_name}`.content IF NOT EXISTS STRING")
        self._command(f"CREATE PROPERTY `{self._type_name}`.embedding IF NOT EXISTS ARRAY_OF_FLOATS")
        self._command(f"CREATE PROPERTY `{self._type_name}`.meta IF NOT EXISTS MAP")

        # 4. Unique index on id
        try:
            self._command(f"CREATE INDEX ON `{self._type_name}` (id) UNIQUE")
        except RuntimeError:
            logger.debug("Unique index on id already exists")

        # 5. LSM_VECTOR index on embedding (HNSW-based, ACID-compliant)
        metric = self._SIMILARITY_MAP.get(self._similarity_function)
        if metric is None:
            # Keep the historical cosine fallback, but no longer silently.
            logger.warning(
                "Unknown similarity_function '%s'; falling back to 'cosine'. Supported values: %s",
                self._similarity_function,
                ", ".join(self._SIMILARITY_MAP),
            )
            metric = "COSINE"
        try:
            self._command(
                f"CREATE INDEX IF NOT EXISTS ON `{self._type_name}` (embedding) LSM_VECTOR "
                f"METADATA {{ dimensions: {self._embedding_dimension}, similarity: '{metric}' }}"
            )
        except RuntimeError:
            logger.debug("Vector index on embedding already exists")

        self._initialized = True
        logger.info(
            "ArcadeDBDocumentStore initialized: database=%s, type=%s, dim=%d, metric=%s",
            self._database,
            self._type_name,
            self._embedding_dimension,
            metric,
        )

    # ------------------------------------------------------------------
    # DocumentStore protocol
    # ------------------------------------------------------------------

    def count_documents(self) -> int:
        """
        Returns how many documents are present in the document store.

        :returns:
            Number of documents in the document store.
        """
        self._ensure_initialized()
        rows = self._command(f"SELECT count(*) AS cnt FROM `{self._type_name}`")
        if rows:
            return int(rows[0].get("cnt", 0))
        return 0

    def filter_documents(
        self,
        filters: dict[str, Any] | None = None,
    ) -> list[Document]:
        """
        Return documents matching the given filters.

        :param filters: Haystack filter dictionary; ``None`` or empty returns every document.
        :returns: List of matching documents.
        :raises FilterError: If the filter dictionary is malformed.
        """
        self._ensure_initialized()
        try:
            where = _convert_filters(filters)
        except ValueError as e:
            raise FilterError(str(e)) from e
        sql = f"SELECT * FROM `{self._type_name}`"
        if where:
            sql += f" WHERE {where}"
        rows = self._command(sql)
        return _from_arcadedb_to_haystack(rows)

    def write_documents(
        self,
        documents: list[Document],
        policy: DuplicatePolicy = DuplicatePolicy.NONE,
    ) -> int:
        """
        Write documents to the store.

        Documents whose embedding is missing or does not have ``embedding_dimension``
        entries are stored with a zero vector instead.

        :param documents: List of Haystack Documents to write.
        :param policy: How to handle duplicate document IDs. ``OVERWRITE`` replaces the stored
            document, ``SKIP`` keeps it, any other policy raises on a duplicate.
        :returns: Number of documents written.
        :raises ValueError: If ``documents`` is not a list of ``Document`` objects.
        :raises DuplicateDocumentError: If a document ID already exists and the policy is
            neither ``OVERWRITE`` nor ``SKIP``.
        """
        # Validate before touching the server so bad input never triggers schema setup.
        msg = "documents must be a list of Document objects"
        if not isinstance(documents, list):
            raise ValueError(msg)
        for doc in documents:
            if not isinstance(doc, Document):
                raise ValueError(msg)
        if not documents:
            return 0

        self._ensure_initialized()
        records = _from_haystack_to_arcadedb(documents)
        written = 0

        for record in records:
            emb = record["embedding"]
            if emb is None or not isinstance(emb, list) or len(emb) != self._embedding_dimension:
                if emb is not None:
                    # An embedding was supplied but cannot be indexed; make the substitution visible.
                    logger.warning(
                        "Embedding of document '%s' does not match embedding_dimension=%d; storing a zero vector",
                        record["id"],
                        self._embedding_dimension,
                    )
                emb = [0.0] * self._embedding_dimension
            embedding_str = str(emb)
            meta_str = _map_literal(record["meta"]) if record["meta"] else "{}"

            if policy == DuplicatePolicy.OVERWRITE:
                # Try to update in place first; insert only when no record had this id.
                sql = (
                    f"UPDATE `{self._type_name}` SET "
                    f"content = {_sql_str(record['content'])}, "
                    f"embedding = {embedding_str}, "
                    f"meta = {meta_str} "
                    f"WHERE id = {_sql_str(record['id'])}"
                )
                result = self._command(sql)
                updated = int(result[0].get("count", 0)) if result else 0
                if updated == 0:
                    self._insert_record(record, embedding_str, meta_str)
                written += 1

            elif policy == DuplicatePolicy.SKIP:
                existing = self._command(f"SELECT id FROM `{self._type_name}` WHERE id = {_sql_str(record['id'])}")
                if existing:
                    continue
                self._insert_record(record, embedding_str, meta_str)
                written += 1

            else:
                # DuplicatePolicy.NONE / FAIL — raise on duplicate
                existing = self._command(f"SELECT id FROM `{self._type_name}` WHERE id = {_sql_str(record['id'])}")
                if existing:
                    msg = f"Document with id '{record['id']}' already exists."
                    raise DuplicateDocumentError(msg)
                self._insert_record(record, embedding_str, meta_str)
                written += 1

        return written

    def _insert_record(self, record: dict[str, Any], embedding_str: str, meta_str: str) -> None:
        """Insert one record; ``embedding_str`` and ``meta_str`` are already rendered SQL literals."""
        sql = (
            f"INSERT INTO `{self._type_name}` SET "
            f"id = {_sql_str(record['id'])}, "
            f"content = {_sql_str(record['content'])}, "
            f"embedding = {embedding_str}, "
            f"meta = {meta_str}"
        )
        self._command(sql)

    def delete_documents(self, document_ids: list[str]) -> None:
        """
        Delete documents by their IDs.

        :param document_ids: List of document IDs to delete. An empty list is a no-op.
        """
        self._ensure_initialized()
        if not document_ids:
            return
        ids_str = ", ".join(_sql_str(did) for did in document_ids)
        self._command(f"DELETE FROM `{self._type_name}` WHERE id IN [{ids_str}]")

    # ------------------------------------------------------------------
    # Retrieval (called by Retriever components)
    # ------------------------------------------------------------------

    def _embedding_retrieval(
        self,
        query_embedding: list[float],
        *,
        filters: dict[str, Any] | None = None,
        top_k: int = 10,
    ) -> list[Document]:
        """
        Retrieve documents by vector similarity using ArcadeDB's LSM_VECTOR index.

        Filters are applied after the neighbor search, so fewer than ``top_k`` documents
        may be returned when some neighbors do not match.

        :param query_embedding: The embedding vector to search with.
        :param filters: Optional metadata filters (applied as post-filter).
        :param top_k: Maximum number of documents to return.
        :returns: Documents in the order returned by the index, each with ``score = 1 - distance``.
        :raises FilterError: If the filter dictionary is malformed.
        """
        self._ensure_initialized()

        # Convert filters up front: a malformed filter fails before any query is sent,
        # and with the same exception type as `filter_documents`.
        try:
            where = _convert_filters(filters)
        except ValueError as e:
            raise FilterError(str(e)) from e

        embedding_str = str(query_embedding)

        # vectorNeighbors returns a single row with a "neighbors" list of {record, distance}
        sql = f"SELECT vectorNeighbors('{self._type_name}[embedding]', {embedding_str}, {top_k}) AS neighbors"
        rows = self._command(sql)
        if not rows or not rows[0].get("neighbors"):
            return []

        documents = []
        for neighbor in rows[0]["neighbors"]:
            record = neighbor.get("record", {})
            distance = neighbor.get("distance", 0.0)
            documents.append(
                Document(
                    id=record.get("id", ""),
                    content=record.get("content"),
                    meta=record.get("meta") or {},
                    score=1.0 - distance,
                )
            )

        # Post-filter by metadata if specified. Only the candidate ids are checked,
        # so the filter query does not scan the whole type.
        if where:
            candidate_ids = ", ".join(_sql_str(d.id) for d in documents)
            matching = self._command(
                f"SELECT id FROM `{self._type_name}` WHERE id IN [{candidate_ids}] AND ({where})"
            )
            filtered_ids = {r["id"] for r in matching}
            documents = [d for d in documents if d.id in filtered_ids]

        return documents


def _sql_str(value: str | None) -> str:
    """Escape and quote a string value for ArcadeDB SQL; ``None`` becomes ``NULL``."""
    if value is None:
        return "NULL"
    # Backslashes first, otherwise the backslash added for a quote would be doubled.
    escaped = value.replace("\\", "\\\\").replace("'", "\\'")
    return f"'{escaped}'"


def _meta_value_literal(value: Any) -> str:
    """Render one metadata value as an ArcadeDB SQL literal; list items are rendered element by element."""
    if value is None:
        return "NULL"
    if isinstance(value, str):
        return _sql_str(value)
    # bool must be tested before int: bool is a subclass of int.
    if isinstance(value, bool):
        return "true" if value else "false"
    if isinstance(value, (int, float)):
        return f"{value}"
    if isinstance(value, list):
        return "[" + ", ".join(_meta_value_literal(item) for item in value) + "]"
    return _sql_str(str(value))


def _map_literal(meta: dict[str, Any]) -> str:
    """
    Build an ArcadeDB MAP literal from a Python dict.

    Keys are double-quoted with backslashes and double quotes escaped. Strings, booleans,
    numbers, ``None`` and lists of those are rendered as SQL literals; any other value
    is stored as its ``str()`` representation.
    """
    if not meta:
        return "{}"
    pairs = []
    for key, value in meta.items():
        safe_key = str(key).replace("\\", "\\\\").replace('"', '\\"')
        pairs.append(f'"{safe_key}": {_meta_value_literal(value)}')
    return "{" + ", ".join(pairs) + "}"
b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/filters.py new file mode 100644 index 0000000000..d089898a58 --- /dev/null +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/filters.py @@ -0,0 +1,117 @@ +# SPDX-FileCopyrightText: 2025-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +"""Convert Haystack filter dictionaries to ArcadeDB SQL WHERE clauses.""" + +from typing import Any + + +def _convert_filters(filters: dict[str, Any] | None) -> str: + """ + Convert a Haystack filter dictionary to an ArcadeDB SQL WHERE clause. + + Supports comparison operators (==, !=, >, >=, <, <=, in, not in) + and logical operators (AND, OR, NOT). + """ + if not filters: + return "" + return _parse_condition(filters) + + +def _parse_condition(condition: dict[str, Any]) -> str: + operator = condition.get("operator") + if not operator: + msg = f"Missing 'operator' in filter condition: {condition}" + raise ValueError(msg) + + operator_upper = operator.upper() + + if operator_upper in ("AND", "OR"): + if "conditions" not in condition: + msg = f"Missing 'conditions' in filter: {condition}" + raise ValueError(msg) + conditions = condition.get("conditions", []) + if not conditions: + return "" + parts = [_parse_condition(c) for c in conditions] + parts = [p for p in parts if p] + if not parts: + return "" + if len(parts) == 1: + return parts[0] + joiner = f" {operator_upper} " + return f"({joiner.join(parts)})" + + if operator_upper == "NOT": + conditions = condition.get("conditions", []) + if not conditions: + return "" + inner = _parse_condition(conditions[0]) + return f"NOT ({inner})" if inner else "" + + field = condition.get("field") + value = condition.get("value") + + if not field: + msg = f"Missing 'field' in filter condition: {condition}" + raise ValueError(msg) + if "value" not in condition: + msg = f"Missing 'value' in filter condition: {condition}" + raise ValueError(msg) + + return _comparison_to_sql(field, 
operator, value) + + +def _comparison_to_sql(field: str, operator: str, value: Any) -> str: + if operator == "==": + if value is None: + return f"{field} IS NULL" + return f"{field} = {_sql_value(value)}" + + if operator == "!=": + if value is None: + return f"{field} IS NOT NULL" + return f"{field} <> {_sql_value(value)}" + + if operator in (">", ">=", "<", "<="): + if value is None: + return "1 = 0" + if isinstance(value, list): + msg = "Comparison operators require numeric or datetime values, not list" + raise ValueError(msg) + if isinstance(value, str) and "T" not in value: + msg = "Comparison operators require numeric or datetime (ISO) values, not plain string" + raise ValueError(msg) + return f"{field} {operator} {_sql_value(value)}" + + if operator == "in": + if not isinstance(value, list): + msg = "Operator 'in' requires value to be a list" + raise ValueError(msg) + values = ", ".join(_sql_value(v) for v in value) + return f"{field} IN [{values}]" + + if operator == "not in": + if not isinstance(value, list): + msg = "Operator 'not in' requires value to be a list" + raise ValueError(msg) + values = ", ".join(_sql_value(v) for v in value) + return f"{field} NOT IN [{values}]" + + msg = f"Unsupported filter operator: {operator}" + raise ValueError(msg) + + +def _sql_value(value: Any) -> str: + """Format a Python value as an ArcadeDB SQL literal.""" + if isinstance(value, str): + escaped = value.replace("'", "\\'") + return f"'{escaped}'" + if isinstance(value, bool): + return "true" if value else "false" + if isinstance(value, (int, float)): + return str(value) + if value is None: + return "NULL" + return f"'{value}'" diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/py.typed b/integrations/arcadedb/src/haystack_integrations/document_stores/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/arcadedb/tests/__init__.py b/integrations/arcadedb/tests/__init__.py new file mode 100644 index 
@pytest.fixture
def document_store():
    """
    Build the ArcadeDB document store used by the integration tests.

    The store points at ``ARCADEDB_URL`` and asks for its type to be recreated on initialization.
    """
    return ArcadeDBDocumentStore(
        url=ARCADEDB_URL,
        database="haystack_test",
        embedding_dimension=768,
        recreate_type=True,
    )


def _sample_docs(n: int = 3, dim: int = 4) -> list[Document]:
    """Create ``n`` documents whose embedding is ``dim`` copies of the document index."""
    return [
        Document(
            content=f"Document number {index}",
            embedding=[float(index)] * dim,
            meta={"category": "test", "priority": index},
        )
        for index in range(n)
    ]


class TestSerialization:
    def test_to_dict_from_dict(self):
        """A store rebuilt from its own serialized form keeps url, database and dimension."""
        original = ArcadeDBDocumentStore(
            url="http://localhost:2480",
            database="test_db",
            embedding_dimension=4,
        )
        rebuilt = ArcadeDBDocumentStore.from_dict(original.to_dict())
        assert rebuilt._database == original._database
        assert rebuilt._embedding_dimension == original._embedding_dimension
        assert rebuilt._url == original._url


@pytest.mark.skipif(
    not os.environ.get("ARCADEDB_PASSWORD"),
    reason="Set ARCADEDB_PASSWORD (e.g. via repo secret in CI) to run integration tests.",
)
@pytest.mark.integration
class TestArcadeDBDocumentStore(DocumentStoreBaseTests):
    """
    Haystack's DocumentStore mixin tests, run against ArcadeDBDocumentStore.

    The base tests exercise count_documents, delete_documents, filter_documents and
    write_documents. ArcadeDB has no delete_all_documents, delete_by_filter or
    update_by_filter, which is why the base (not the extended) mixin is used.
    """

    @pytest.fixture
    def document_store(self, document_store: ArcadeDBDocumentStore) -> ArcadeDBDocumentStore:
        """Hand the conftest ArcadeDB document store to the mixin tests."""
        yield document_store

    def assert_documents_are_equal(self, received: list[Document], expected: list[Document]):
        """
        Compare two document lists regardless of order.

        Scores are cleared on the received documents (filter_documents does not set them,
        embedding retrieval does). Embeddings are compared approximately to tolerate float
        round-trips, and a document expected without an embedding is compared as if the
        stored zero vector were ``None``.
        """
        assert len(received) == len(expected)

        def by_id(doc):
            return doc.id

        pairs = zip(sorted(received, key=by_id), sorted(expected, key=by_id), strict=True)
        for got, want in pairs:
            got.score = None
            if want.embedding is None:
                got.embedding = None
            elif got.embedding is None:
                assert want.embedding is None
            else:
                assert got.embedding == pytest.approx(want.embedding)
                # Embeddings were already checked approximately; drop them before the exact comparison.
                got.embedding = want.embedding = None
            assert got == want

    def test_write_documents(self, document_store: ArcadeDBDocumentStore):
        """Override mixin: a first write succeeds, rewriting the same id with FAIL raises."""
        single = [Document(id="1")]
        assert document_store.write_documents(single) == 1
        with pytest.raises(DuplicateDocumentError):
            document_store.write_documents(single, policy=DuplicatePolicy.FAIL)

    def test_write_overwrite(self, document_store: ArcadeDBDocumentStore):
        """ArcadeDB-specific: OVERWRITE replaces the content of an existing document."""
        first = _sample_docs(1)
        document_store.write_documents(first, policy=DuplicatePolicy.OVERWRITE)

        replacement = dataclasses.replace(first[0], content="Updated content")
        document_store.write_documents([replacement], policy=DuplicatePolicy.OVERWRITE)

        stored = document_store.filter_documents()
        assert len(stored) == 1
        assert stored[0].content == "Updated content"

    def test_embedding_retrieval(self, document_store: ArcadeDBDocumentStore):
        """ArcadeDB-specific: vector search via _embedding_retrieval."""
        # Embeddings must have the store's dimension (768 from conftest).
        dim = document_store._embedding_dimension
        docs = []
        for i in range(5):
            docs.append(
                Document(
                    content=f"Document number {i}",
                    embedding=[float(i)] * dim,
                    meta={"category": "test", "priority": i},
                )
            )
        document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE)

        query = [4.0] * dim
        results = document_store._embedding_retrieval(query_embedding=query, top_k=3)
        assert len(results) <= 3
        assert results[0].score is not None
def test_or(self):
    """Two comparisons under OR are joined and parenthesized."""
    spec = {
        "operator": "OR",
        "conditions": [
            {"field": "meta.x", "operator": "==", "value": "yes"},
            {"field": "meta.y", "operator": "==", "value": "no"},
        ],
    }
    assert _convert_filters(spec) == "(meta.x = 'yes' OR meta.y = 'no')"


def test_not(self):
    """A single condition under NOT is wrapped in NOT (...)."""
    spec = {
        "operator": "NOT",
        "conditions": [
            {"field": "meta.deleted", "operator": "==", "value": True},
        ],
    }
    assert _convert_filters(spec) == "NOT (meta.deleted = true)"


def test_nested(self):
    """A logical group nested inside another keeps its own parentheses."""
    inner = {
        "operator": "OR",
        "conditions": [
            {"field": "meta.b", "operator": "==", "value": 2},
            {"field": "meta.c", "operator": "==", "value": 3},
        ],
    }
    spec = {
        "operator": "AND",
        "conditions": [
            {"field": "meta.a", "operator": "==", "value": 1},
            inner,
        ],
    }
    assert _convert_filters(spec) == "(meta.a = 1 AND (meta.b = 2 OR meta.c = 3))"


def test_missing_operator_raises(self):
    """A condition without 'operator' is rejected."""
    spec = {"field": "x", "value": 1}
    with pytest.raises(ValueError):
        _convert_filters(spec)


def test_missing_field_raises(self):
    """A comparison without 'field' is rejected."""
    spec = {"operator": "==", "value": 1}
    with pytest.raises(ValueError):
        _convert_filters(spec)