From 65f804a681d0cc48df14326aad8bb8e63c7cde0a Mon Sep 17 00:00:00 2001 From: lvca Date: Sat, 28 Feb 2026 18:02:38 -0500 Subject: [PATCH 01/16] feat: add ArcadeDB document store integration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add ArcadeDBDocumentStore and ArcadeDBEmbeddingRetriever for Haystack 2.x. ArcadeDB is an open-source multi-model database that combines document storage, HNSW vector search (LSM_VECTOR), and SQL metadata filtering in a single engine. This integration connects via the HTTP/JSON API using only the requests library — no special drivers needed. Components: - ArcadeDBDocumentStore: full DocumentStore protocol (count, filter, write, delete) with automatic schema/index initialization - ArcadeDBEmbeddingRetriever: pipeline component for vector similarity retrieval with FilterPolicy support - Filter conversion: Haystack filter dicts → ArcadeDB SQL WHERE clauses - Document converters: Haystack Document ↔ ArcadeDB record mapping Includes CI workflow with ArcadeDB Docker service, unit tests for filter conversion, and integration tests for all DocumentStore operations. 
--- .github/labeler.yml | 5 + .github/workflows/arcadedb.yml | 82 ++++ README.md | 1 + integrations/arcadedb/LICENSE.txt | 190 +++++++++ integrations/arcadedb/README.md | 85 ++++ .../arcadedb/examples/embedding_retrieval.py | 87 ++++ .../arcadedb/pydoc/config_docusaurus.yml | 14 + integrations/arcadedb/pyproject.toml | 117 +++++ .../retrievers/arcadedb/__init__.py | 7 + .../arcadedb/embedding_retriever.py | 104 +++++ .../components/retrievers/py.typed | 0 .../document_stores/arcadedb/__init__.py | 7 + .../document_stores/arcadedb/converters.py | 38 ++ .../arcadedb/document_store.py | 400 ++++++++++++++++++ .../document_stores/arcadedb/filters.py | 106 +++++ .../document_stores/py.typed | 0 integrations/arcadedb/tests/__init__.py | 0 .../arcadedb/tests/test_document_store.py | 160 +++++++ integrations/arcadedb/tests/test_filters.py | 95 +++++ 19 files changed, 1498 insertions(+) create mode 100644 .github/workflows/arcadedb.yml create mode 100644 integrations/arcadedb/LICENSE.txt create mode 100644 integrations/arcadedb/README.md create mode 100644 integrations/arcadedb/examples/embedding_retrieval.py create mode 100644 integrations/arcadedb/pydoc/config_docusaurus.yml create mode 100644 integrations/arcadedb/pyproject.toml create mode 100644 integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/__init__.py create mode 100644 integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py create mode 100644 integrations/arcadedb/src/haystack_integrations/components/retrievers/py.typed create mode 100644 integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/__init__.py create mode 100644 integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/converters.py create mode 100644 integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py create mode 100644 integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/filters.py 
create mode 100644 integrations/arcadedb/src/haystack_integrations/document_stores/py.typed create mode 100644 integrations/arcadedb/tests/__init__.py create mode 100644 integrations/arcadedb/tests/test_document_store.py create mode 100644 integrations/arcadedb/tests/test_filters.py diff --git a/.github/labeler.yml b/.github/labeler.yml index 30181024e7..a4add7d750 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -4,6 +4,11 @@ integration:aimlapi: - any-glob-to-any-file: "integrations/aimlapi/**/*" - any-glob-to-any-file: ".github/workflows/aimlapi.yml" +integration:arcadedb: + - changed-files: + - any-glob-to-any-file: "integrations/arcadedb/**/*" + - any-glob-to-any-file: ".github/workflows/arcadedb.yml" + integration:amazon-bedrock: - changed-files: - any-glob-to-any-file: "integrations/amazon_bedrock/**/*" diff --git a/.github/workflows/arcadedb.yml b/.github/workflows/arcadedb.yml new file mode 100644 index 0000000000..28b57526f9 --- /dev/null +++ b/.github/workflows/arcadedb.yml @@ -0,0 +1,82 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / arcadedb + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/arcadedb/**" + - "!integrations/arcadedb/*.md" + - ".github/workflows/arcadedb.yml" + +concurrency: + group: arcadedb-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + ARCADEDB_USERNAME: "root" + ARCADEDB_PASSWORD: "arcadedb" + +defaults: + run: + working-directory: integrations/arcadedb + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest] + python-version: ["3.10", "3.13"] + services: + arcadedb: + image: 
arcadedata/arcadedb:latest + env: + JAVA_OPTS: "-Darcadedb.server.rootPassword=arcadedb" + ports: + - 2480:2480 + + steps: + - uses: actions/checkout@v6 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install hatch "virtualenv<21.0.0" + + - name: Lint + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: hatch run fmt-check && hatch run test:types + + - name: Run tests + run: hatch run test:cov-retry + + - name: Run unit tests with lowest direct dependencies + run: | + hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt + hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt + hatch run test:unit + + - name: Nightly - run unit tests with Haystack main branch + if: github.event_name == 'schedule' + run: | + hatch env prune + hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main + hatch run test:unit + + - name: Send event to Datadog for nightly failures + if: failure() && github.event_name == 'schedule' + uses: ./.github/actions/send_failure + with: + title: | + Core integrations nightly tests failure: ${{ github.workflow }} + api-key: ${{ secrets.CORE_DATADOG_API_KEY }} diff --git a/README.md b/README.md index 560e659cc6..267300f92c 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [amazon-bedrock-haystack](integrations/amazon_bedrock/) | Embedder, Generator, Ranker, Downloader | [![PyPI - Version](https://img.shields.io/pypi/v/amazon-bedrock-haystack.svg)](https://pypi.org/project/amazon-bedrock-haystack) | [![Test / 
amazon_bedrock](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_bedrock.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_bedrock.yml) | | [amazon-sagemaker-haystack](integrations/amazon_sagemaker/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/amazon-sagemaker-haystack.svg)](https://pypi.org/project/amazon-sagemaker-haystack) | [![Test / amazon_sagemaker](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_sagemaker.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_sagemaker.yml) | | [anthropic-haystack](integrations/anthropic/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/anthropic-haystack.svg)](https://pypi.org/project/anthropic-haystack) | [![Test / anthropic](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/anthropic.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/anthropic.yml) | +| [arcadedb-haystack](integrations/arcadedb/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/arcadedb-haystack.svg)](https://pypi.org/project/arcadedb-haystack) | [![Test / arcadedb](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/arcadedb.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/arcadedb.yml) | | [astra-haystack](integrations/astra/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/astra-haystack.svg)](https://pypi.org/project/astra-haystack) | [![Test / astra](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/astra.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/astra.yml) | | [azure-ai-search-haystack](integrations/azure_ai_search/) | Document Store | [![PyPI - 
Version](https://img.shields.io/pypi/v/azure-ai-search-haystack.svg)](https://pypi.org/project/azure-ai-search-haystack) | [![Test / azure-ai-search](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_ai_search.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_ai_search.yml) | | [azure-doc-intelligence-haystack](integrations/azure_doc_intelligence/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/azure-doc-intelligence-haystack.svg)](https://pypi.org/project/azure-doc-intelligence-haystack) | [![Test / azure_doc_intelligence](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_doc_intelligence.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/azure_doc_intelligence.yml) | diff --git a/integrations/arcadedb/LICENSE.txt b/integrations/arcadedb/LICENSE.txt new file mode 100644 index 0000000000..0fa7906e3a --- /dev/null +++ b/integrations/arcadedb/LICENSE.txt @@ -0,0 +1,190 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to the Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by the Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding any notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2025 ArcadeData Ltd + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/integrations/arcadedb/README.md b/integrations/arcadedb/README.md new file mode 100644 index 0000000000..0ee93f0722 --- /dev/null +++ b/integrations/arcadedb/README.md @@ -0,0 +1,85 @@ +# arcadedb-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/arcadedb-haystack.svg)](https://pypi.org/project/arcadedb-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/arcadedb-haystack.svg)](https://pypi.org/project/arcadedb-haystack) +[![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE.txt) + +**[ArcadeDB](https://arcadedb.com)** integration for [Haystack](https://haystack.deepset.ai/) 2.x. + +ArcadeDB is an open-source multi-model database that combines document storage, HNSW vector search, and SQL metadata filtering in a single engine. This integration provides a `DocumentStore` and `EmbeddingRetriever` that connect to ArcadeDB via its HTTP/JSON API using only the `requests` library -- no special drivers needed. + +## Installation + +```bash +pip install arcadedb-haystack +``` + +## Usage + +Start ArcadeDB: + +```bash +docker run -d -p 2480:2480 \ + -e JAVA_OPTS="-Darcadedb.server.rootPassword=arcadedb" \ + arcadedata/arcadedb:latest + +export ARCADEDB_USERNAME=root +export ARCADEDB_PASSWORD=arcadedb +``` + +### Document Store + +```python +from haystack import Document +from haystack.document_stores.types import DuplicatePolicy +from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore + +store = ArcadeDBDocumentStore( + database="myproject", + embedding_dimension=768, +) + +docs = [ + Document( + content="ArcadeDB supports graphs, documents, and vectors.", + embedding=[0.1] * 768, + meta={"source": "docs", "category": "database"}, + ) +] +store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) +store.filter_documents( + filters={"field": "meta.category", "operator": "==", "value": "database"} +) +``` + +### Pipeline with Embedding Retriever + +```python +from haystack 
import Pipeline +from haystack_integrations.components.retrievers.arcadedb import ArcadeDBEmbeddingRetriever +from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore + +store = ArcadeDBDocumentStore(database="myproject", embedding_dimension=768) +pipeline = Pipeline() +pipeline.add_component("retriever", ArcadeDBEmbeddingRetriever(document_store=store, top_k=10)) + +result = pipeline.run({"retriever": {"query_embedding": [0.1] * 768}}) +``` + +## Configuration + +| Parameter | Default | Description | +|---|---|---| +| `url` | `http://localhost:2480` | ArcadeDB HTTP endpoint | +| `database` | `haystack` | Database name | +| `username` | env `ARCADEDB_USERNAME` | HTTP Basic Auth username | +| `password` | env `ARCADEDB_PASSWORD` | HTTP Basic Auth password | +| `type_name` | `Document` | Vertex type name | +| `embedding_dimension` | `768` | Vector dimension for HNSW index | +| `similarity_function` | `cosine` | `cosine`, `euclidean`, or `dot` | +| `recreate_type` | `False` | Drop and recreate type on init | +| `create_database` | `True` | Create database if it doesn't exist | + +## License + +`arcadedb-haystack` is distributed under the terms of the [Apache-2.0](LICENSE.txt) license. diff --git a/integrations/arcadedb/examples/embedding_retrieval.py b/integrations/arcadedb/examples/embedding_retrieval.py new file mode 100644 index 0000000000..ffdbad3acd --- /dev/null +++ b/integrations/arcadedb/examples/embedding_retrieval.py @@ -0,0 +1,87 @@ +#!/usr/bin/env python3 +# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Example: Embedding retrieval with ArcadeDB + Haystack. 
+ +Prerequisites: + docker run -d -p 2480:2480 \ + -e JAVA_OPTS="-Darcadedb.server.rootPassword=arcadedb" \ + arcadedata/arcadedb:latest + + pip install arcadedb-haystack + +Usage: + export ARCADEDB_USERNAME=root + export ARCADEDB_PASSWORD=arcadedb + python examples/embedding_retrieval.py +""" + +from haystack import Document, Pipeline +from haystack.document_stores.types import DuplicatePolicy + +from haystack_integrations.components.retrievers.arcadedb import ArcadeDBEmbeddingRetriever +from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore + +# --- 1. Create the document store --- +store = ArcadeDBDocumentStore( + url="http://localhost:2480", + database="haystack_example", + embedding_dimension=4, # small dim for demo + similarity_function="cosine", + recreate_type=True, +) + +# --- 2. Write some documents --- +documents = [ + Document( + content="ArcadeDB is a multi-model database supporting graphs, documents, key-value, time-series, and vectors.", + embedding=[1.0, 0.0, 0.0, 0.0], + meta={"category": "database", "source": "docs"}, + ), + Document( + content="Haystack is an open-source framework for building RAG pipelines.", + embedding=[0.0, 1.0, 0.0, 0.0], + meta={"category": "framework", "source": "docs"}, + ), + Document( + content="HNSW (Hierarchical Navigable Small World) enables fast approximate nearest neighbor search.", + embedding=[0.5, 0.5, 0.0, 0.0], + meta={"category": "algorithm", "source": "paper"}, + ), + Document( + content="Vector databases store high-dimensional embeddings for semantic search.", + embedding=[0.8, 0.2, 0.0, 0.0], + meta={"category": "database", "source": "blog"}, + ), +] + +written = store.write_documents(documents, policy=DuplicatePolicy.OVERWRITE) +print(f"Wrote {written} documents") +print(f"Total documents: {store.count_documents()}") + +# --- 3. 
Build a retrieval pipeline --- +pipeline = Pipeline() +pipeline.add_component("retriever", ArcadeDBEmbeddingRetriever(document_store=store, top_k=3)) + +# --- 4. Run a similarity search --- +query_embedding = [0.9, 0.1, 0.0, 0.0] # close to "ArcadeDB" and "Vector databases" +result = pipeline.run({"retriever": {"query_embedding": query_embedding}}) + +print("\n--- Top 3 results ---") +for doc in result["retriever"]["documents"]: + print(f" score={doc.score:.4f} category={doc.meta.get('category')} content={doc.content[:80]}...") + +# --- 5. Filter retrieval (only 'database' category) --- +result_filtered = pipeline.run({ + "retriever": { + "query_embedding": query_embedding, + "filters": {"field": "meta.category", "operator": "==", "value": "database"}, + } +}) + +print("\n--- Filtered (category=database) ---") +for doc in result_filtered["retriever"]["documents"]: + print(f" score={doc.score:.4f} content={doc.content[:80]}...") diff --git a/integrations/arcadedb/pydoc/config_docusaurus.yml b/integrations/arcadedb/pydoc/config_docusaurus.yml new file mode 100644 index 0000000000..e3cb005d73 --- /dev/null +++ b/integrations/arcadedb/pydoc/config_docusaurus.yml @@ -0,0 +1,14 @@ +loaders: + - modules: + - haystack_integrations.components.retrievers.arcadedb.embedding_retriever + - haystack_integrations.document_stores.arcadedb.document_store + search_path: [../src] +processors: + - type: filter + documented_only: true + skip_empty_modules: true +renderer: + description: ArcadeDB integration for Haystack + id: integrations-arcadedb + filename: arcadedb.md + title: ArcadeDB diff --git a/integrations/arcadedb/pyproject.toml b/integrations/arcadedb/pyproject.toml new file mode 100644 index 0000000000..c546f3dd87 --- /dev/null +++ b/integrations/arcadedb/pyproject.toml @@ -0,0 +1,117 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "arcadedb-haystack" +dynamic = ["version"] +description = "An integration of 
ArcadeDB with Haystack — document storage + HNSW vector search + SQL filtering" +readme = "README.md" +requires-python = ">=3.10" +license = "Apache-2.0" +keywords = ["arcadedb", "haystack", "vector-search", "document-store", "rag"] +authors = [{ name = "ArcadeData Ltd", email = "info@arcadedb.com" }] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ + "haystack-ai>=2.9.0", + "requests", +] + +[project.urls] +Source = "https://github.com/deepset-ai/haystack-core-integrations" +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/arcadedb/README.md" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/arcadedb-v(?P<version>.*)' + +[tool.hatch.version.raw-options] +root = "../.." 
+git_describe_command = 'git describe --tags --match="integrations/arcadedb-v[0-9]*"' + +[tool.hatch.envs.default] +installer = "uv" +dependencies = ["haystack-pydoc-tools", "ruff"] + +[tool.hatch.envs.default.scripts] +docs = ["haystack-pydoc pydoc/config_docusaurus.yml"] +fmt = "ruff check --fix {args}; ruff format {args}" +fmt-check = "ruff check {args} && ruff format --check {args}" + +[tool.hatch.envs.test] +dependencies = [ + "pytest", + "pytest-cov", + "pytest-rerunfailures", + "mypy", + "pip", +] + +[tool.hatch.envs.test.scripts] +unit = 'pytest -m "not integration" {args:tests}' +integration = 'pytest -m "integration" {args:tests}' +all = 'pytest {args:tests}' +cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x {args:tests}' +types = "mypy -p haystack_integrations.document_stores.arcadedb -p haystack_integrations.components.retrievers.arcadedb {args}" + +[tool.mypy] +install_types = true +non_interactive = true +check_untyped_defs = true +disallow_incomplete_defs = true + +[[tool.mypy.overrides]] +module = ["requests.*"] +ignore_missing_imports = true + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = [ + "A", "ARG", "B", "C", "DTZ", "E", "EM", "F", "FBT", "I", "ICN", + "ISC", "N", "PLC", "PLE", "PLR", "PLW", "Q", "RUF", "S", "T", + "TID", "UP", "W", "YTT", +] +ignore = [ + "B027", "FBT003", "S105", "S106", "S107", + "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", + "B008", "S101", +] + +[tool.ruff.lint.isort] +known-first-party = ["haystack_integrations"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.lint.per-file-ignores] +"tests/**/*" = ["PLR2004", "S101", "TID252"] +"examples/**/*" = ["T201"] + +[tool.coverage.run] +source = ["haystack_integrations"] +branch = true +parallel = false + +[tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing = true +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + 
+[tool.pytest.ini_options] +markers = ["integration: integration tests"] diff --git a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/__init__.py b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/__init__.py new file mode 100644 index 0000000000..eb4a7bfbe5 --- /dev/null +++ b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# +# SPDX-License-Identifier: Apache-2.0 + +from haystack_integrations.components.retrievers.arcadedb.embedding_retriever import ArcadeDBEmbeddingRetriever + +__all__ = ["ArcadeDBEmbeddingRetriever"] diff --git a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py new file mode 100644 index 0000000000..1624055eb9 --- /dev/null +++ b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py @@ -0,0 +1,104 @@ +# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# +# SPDX-License-Identifier: Apache-2.0 + +"""ArcadeDB Embedding Retriever for Haystack 2.x pipelines.""" + +from typing import Any + +from haystack import Document, component, default_from_dict, default_to_dict +from haystack.document_stores.types import FilterPolicy + +from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore + + +@component +class ArcadeDBEmbeddingRetriever: + """ + Retrieve documents from ArcadeDB using vector similarity (LSM_VECTOR / HNSW index). 
+ + Usage example: + + ```python + from haystack_integrations.components.retrievers.arcadedb import ArcadeDBEmbeddingRetriever + from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore + + store = ArcadeDBDocumentStore(database="mydb") + retriever = ArcadeDBEmbeddingRetriever(document_store=store, top_k=5) + ``` + """ + + def __init__( + self, + *, + document_store: ArcadeDBDocumentStore, + filters: dict[str, Any] | None = None, + top_k: int = 10, + filter_policy: FilterPolicy = FilterPolicy.REPLACE, + ): + """ + Create an ArcadeDBEmbeddingRetriever. + + :param document_store: An instance of ``ArcadeDBDocumentStore``. + :param filters: Default filters applied to every retrieval call. + :param top_k: Maximum number of documents to return. + :param filter_policy: How runtime filters interact with default filters. + """ + self._document_store = document_store + self._filters = filters + self._top_k = top_k + self._filter_policy = filter_policy + + @component.output_types(documents=list[Document]) + def run( + self, + query_embedding: list[float], + filters: dict[str, Any] | None = None, + top_k: int | None = None, + ) -> dict[str, list[Document]]: + """ + Retrieve documents by vector similarity. + + :param query_embedding: The embedding vector to search with. + :param filters: Optional filters to narrow results. + :param top_k: Maximum number of documents to return. + :returns: A dict with key ``"documents"`` containing the retrieved documents. 
+ """ + effective_top_k = top_k if top_k is not None else self._top_k + + if self._filter_policy == FilterPolicy.REPLACE and filters is not None: + effective_filters = filters + elif self._filter_policy == FilterPolicy.MERGE and filters is not None and self._filters is not None: + effective_filters = { + "operator": "AND", + "conditions": [self._filters, filters], + } + else: + effective_filters = filters or self._filters + + documents = self._document_store._embedding_retrieval( + query_embedding=query_embedding, + filters=effective_filters, + top_k=effective_top_k, + ) + return {"documents": documents} + + def to_dict(self) -> dict[str, Any]: + """Serialize this retriever to a dictionary.""" + return default_to_dict( + self, + document_store=self._document_store.to_dict(), + filters=self._filters, + top_k=self._top_k, + filter_policy=self._filter_policy.value, + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "ArcadeDBEmbeddingRetriever": + """Deserialize an ArcadeDBEmbeddingRetriever from a dictionary.""" + init_params = data.get("init_parameters", {}) + if "document_store" in init_params: + init_params["document_store"] = ArcadeDBDocumentStore.from_dict(init_params["document_store"]) + if "filter_policy" in init_params: + init_params["filter_policy"] = FilterPolicy(init_params["filter_policy"]) + return default_from_dict(cls, data) diff --git a/integrations/arcadedb/src/haystack_integrations/components/retrievers/py.typed b/integrations/arcadedb/src/haystack_integrations/components/retrievers/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/__init__.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/__init__.py new file mode 100644 index 0000000000..3676df644a --- /dev/null +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2025-present 
ArcadeData Ltd +# +# SPDX-License-Identifier: Apache-2.0 + +from haystack_integrations.document_stores.arcadedb.document_store import ArcadeDBDocumentStore + +__all__ = ["ArcadeDBDocumentStore"] diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/converters.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/converters.py new file mode 100644 index 0000000000..ccafefae70 --- /dev/null +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/converters.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# +# SPDX-License-Identifier: Apache-2.0 + +"""Convert between Haystack Documents and ArcadeDB records.""" + +from typing import Any + +from haystack import Document + + +def _from_haystack_to_arcadedb(documents: list[Document]) -> list[dict[str, Any]]: + """Convert Haystack Documents to dicts suitable for ArcadeDB INSERT.""" + records = [] + for doc in documents: + record: dict[str, Any] = { + "id": doc.id, + "content": doc.content, + "embedding": doc.embedding, + "meta": doc.meta, + } + records.append(record) + return records + + +def _from_arcadedb_to_haystack(records: list[dict[str, Any]]) -> list[Document]: + """Convert ArcadeDB query result rows to Haystack Documents.""" + documents = [] + for record in records: + doc = Document( + id=record["id"], + content=record.get("content"), + embedding=record.get("embedding"), + meta=record.get("meta") or {}, + score=record.get("score"), + ) + documents.append(doc) + return documents diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py new file mode 100644 index 0000000000..1b2a1e3596 --- /dev/null +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py @@ -0,0 +1,400 @@ +# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# +# 
SPDX-License-Identifier: Apache-2.0 + +"""ArcadeDB DocumentStore for Haystack 2.x — document storage + vector search via HTTP/JSON API.""" + +import logging +from typing import Any, ClassVar + +import requests +from haystack import Document, default_from_dict, default_to_dict +from haystack.document_stores.errors import DuplicateDocumentError +from haystack.document_stores.types import DuplicatePolicy +from haystack.utils import Secret + +from haystack_integrations.document_stores.arcadedb.converters import ( + _from_arcadedb_to_haystack, + _from_haystack_to_arcadedb, +) +from haystack_integrations.document_stores.arcadedb.filters import _convert_filters + +logger = logging.getLogger(__name__) + + +class ArcadeDBDocumentStore: + """ + An ArcadeDB-backed DocumentStore for Haystack 2.x. + + Uses ArcadeDB's HTTP/JSON API for all operations — no special drivers required. + Supports HNSW vector search (LSM_VECTOR) and SQL metadata filtering. + + Usage example: + + ```python + from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore + + store = ArcadeDBDocumentStore( + url="http://localhost:2480", + database="haystack", + embedding_dimension=768, + ) + ``` + """ + + # Map user-facing similarity names to ArcadeDB LSM_VECTOR metric keywords + _SIMILARITY_MAP: ClassVar[dict[str, str]] = { + "cosine": "COSINE", + "euclidean": "EUCLIDEAN", + "dot": "DOT_PRODUCT", + } + + def __init__( + self, + *, + url: str = "http://localhost:2480", + database: str = "haystack", + username: Secret = Secret.from_env_var("ARCADEDB_USERNAME", strict=False), # noqa: B008 + password: Secret = Secret.from_env_var("ARCADEDB_PASSWORD", strict=False), # noqa: B008 + type_name: str = "Document", + embedding_dimension: int = 768, + similarity_function: str = "cosine", + recreate_type: bool = False, + create_database: bool = True, + ): + """ + Create an ArcadeDBDocumentStore instance. + + :param url: ArcadeDB HTTP endpoint. + :param database: Database name. 
+ :param username: HTTP Basic Auth username (default: ``ARCADEDB_USERNAME`` env var). + :param password: HTTP Basic Auth password (default: ``ARCADEDB_PASSWORD`` env var). + :param type_name: Vertex type name for documents. + :param embedding_dimension: Vector dimension for the HNSW index. + :param similarity_function: Distance metric — ``"cosine"``, ``"euclidean"``, or ``"dot"``. + :param recreate_type: If ``True``, drop and recreate the type on initialization. + :param create_database: If ``True``, create the database if it doesn't exist. + """ + self._url = url.rstrip("/") + self._database = database + self._username = username + self._password = password + self._type_name = type_name + self._embedding_dimension = embedding_dimension + self._similarity_function = similarity_function + self._recreate_type = recreate_type + self._create_database = create_database + + self._session = requests.Session() + self._initialized = False + + # ------------------------------------------------------------------ + # Serialization (Haystack pipeline export/import) + # ------------------------------------------------------------------ + + def to_dict(self) -> dict[str, Any]: + """Serialize this store to a dictionary.""" + return default_to_dict( + self, + url=self._url, + database=self._database, + username=self._username.to_dict() if self._username else None, + password=self._password.to_dict() if self._password else None, + type_name=self._type_name, + embedding_dimension=self._embedding_dimension, + similarity_function=self._similarity_function, + recreate_type=self._recreate_type, + create_database=self._create_database, + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "ArcadeDBDocumentStore": + """Deserialize an ArcadeDBDocumentStore from a dictionary.""" + init_params = data.get("init_parameters", {}) + for key in ("username", "password"): + if init_params.get(key) is not None: + init_params[key] = Secret.from_dict(init_params[key]) + return 
default_from_dict(cls, data) + + # ------------------------------------------------------------------ + # HTTP helpers + # ------------------------------------------------------------------ + + def _auth(self) -> tuple[str, str] | None: + user = self._username.resolve_value() if self._username else None + pwd = self._password.resolve_value() if self._password else None + if user and pwd: + return (user, pwd) + return None + + def _command(self, sql: str, *, positional_params: list[Any] | None = None) -> list[dict[str, Any]]: + """Execute an SQL command via the ArcadeDB HTTP API and return result rows.""" + url = f"{self._url}/api/v1/command/{self._database}" + payload: dict[str, Any] = {"language": "sql", "command": sql} + if positional_params: + payload["params"] = positional_params + + resp = self._session.post(url, json=payload, auth=self._auth()) + if resp.status_code >= 400: + msg = f"ArcadeDB command failed ({resp.status_code}): {resp.text}" + raise RuntimeError(msg) + + body = resp.json() + return body.get("result", []) + + def _server_command(self, command: str) -> dict[str, Any]: + """Execute a server-level command (e.g. CREATE DATABASE).""" + url = f"{self._url}/api/v1/server" + resp = self._session.post(url, json={"command": command}, auth=self._auth()) + if resp.status_code >= 400: + msg = f"ArcadeDB server command failed ({resp.status_code}): {resp.text}" + raise RuntimeError(msg) + return resp.json() + + def _ensure_initialized(self) -> None: + if self._initialized: + return + + # 1. Optionally create the database + if self._create_database: + try: + self._server_command(f"CREATE DATABASE {self._database}") + logger.info("Created database '%s'", self._database) + except RuntimeError: + logger.debug("Database '%s' already exists or cannot be created", self._database) + + # 2. Optionally drop existing type + if self._recreate_type: + try: + self._command(f"DROP TYPE `{self._type_name}` IF EXISTS UNSAFE") + except RuntimeError: + pass + + # 3. 
Create vertex type + properties + self._command(f"CREATE VERTEX TYPE `{self._type_name}` IF NOT EXISTS") + self._command(f"CREATE PROPERTY `{self._type_name}`.id IF NOT EXISTS STRING") + self._command(f"CREATE PROPERTY `{self._type_name}`.content IF NOT EXISTS STRING") + self._command(f"CREATE PROPERTY `{self._type_name}`.embedding IF NOT EXISTS ARRAY_OF_FLOATS") + self._command(f"CREATE PROPERTY `{self._type_name}`.meta IF NOT EXISTS MAP") + + # 4. Unique index on id + try: + self._command(f"CREATE INDEX ON `{self._type_name}` (id) UNIQUE") + except RuntimeError: + logger.debug("Unique index on id already exists") + + # 5. LSM_VECTOR index on embedding (HNSW-based, ACID-compliant) + metric = self._SIMILARITY_MAP.get(self._similarity_function, "COSINE") + try: + self._command( + f"CREATE INDEX IF NOT EXISTS ON `{self._type_name}` (embedding) LSM_VECTOR " + f"METADATA {{ dimensions: {self._embedding_dimension}, similarity: '{metric}' }}" + ) + except RuntimeError: + logger.debug("Vector index on embedding already exists") + + self._initialized = True + logger.info( + "ArcadeDBDocumentStore initialized: database=%s, type=%s, dim=%d, metric=%s", + self._database, + self._type_name, + self._embedding_dimension, + metric, + ) + + # ------------------------------------------------------------------ + # DocumentStore protocol + # ------------------------------------------------------------------ + + def count_documents(self) -> int: + """Return the number of documents stored.""" + self._ensure_initialized() + rows = self._command(f"SELECT count(*) AS cnt FROM `{self._type_name}`") + if rows: + return int(rows[0].get("cnt", 0)) + return 0 + + def filter_documents( + self, + filters: dict[str, Any] | None = None, + ) -> list[Document]: + """ + Return documents matching the given filters. + + :param filters: Haystack filter dictionary. + :returns: List of matching documents. 
+ """ + self._ensure_initialized() + where = _convert_filters(filters) + sql = f"SELECT * FROM `{self._type_name}`" + if where: + sql += f" WHERE {where}" + rows = self._command(sql) + return _from_arcadedb_to_haystack(rows) + + def write_documents( + self, + documents: list[Document], + policy: DuplicatePolicy = DuplicatePolicy.NONE, + ) -> int: + """ + Write documents to the store. + + :param documents: List of Haystack Documents to write. + :param policy: How to handle duplicate document IDs. + :returns: Number of documents written. + """ + self._ensure_initialized() + if not documents: + return 0 + + records = _from_haystack_to_arcadedb(documents) + written = 0 + + for record in records: + embedding_str = str(record["embedding"]) if record["embedding"] else "[]" + meta_str = _map_literal(record["meta"]) if record["meta"] else "{}" + + if policy == DuplicatePolicy.OVERWRITE: + sql = ( + f"UPDATE `{self._type_name}` SET " + f"content = {_sql_str(record['content'])}, " + f"embedding = {embedding_str}, " + f"meta = {meta_str} " + f"WHERE id = {_sql_str(record['id'])}" + ) + result = self._command(sql) + updated = int(result[0].get("count", 0)) if result else 0 + if updated == 0: + self._insert_record(record, embedding_str, meta_str) + written += 1 + + elif policy == DuplicatePolicy.SKIP: + existing = self._command( + f"SELECT id FROM `{self._type_name}` WHERE id = {_sql_str(record['id'])}" + ) + if existing: + continue + self._insert_record(record, embedding_str, meta_str) + written += 1 + + else: + # DuplicatePolicy.NONE — raise on duplicate + existing = self._command( + f"SELECT id FROM `{self._type_name}` WHERE id = {_sql_str(record['id'])}" + ) + if existing: + msg = f"Document with id '{record['id']}' already exists." 
+ raise DuplicateDocumentError(msg) + self._insert_record(record, embedding_str, meta_str) + written += 1 + + return written + + def _insert_record(self, record: dict[str, Any], embedding_str: str, meta_str: str) -> None: + sql = ( + f"INSERT INTO `{self._type_name}` SET " + f"id = {_sql_str(record['id'])}, " + f"content = {_sql_str(record['content'])}, " + f"embedding = {embedding_str}, " + f"meta = {meta_str}" + ) + self._command(sql) + + def delete_documents(self, document_ids: list[str]) -> None: + """ + Delete documents by their IDs. + + :param document_ids: List of document IDs to delete. + """ + self._ensure_initialized() + if not document_ids: + return + ids_str = ", ".join(_sql_str(did) for did in document_ids) + self._command(f"DELETE FROM `{self._type_name}` WHERE id IN [{ids_str}]") + + # ------------------------------------------------------------------ + # Retrieval (called by Retriever components) + # ------------------------------------------------------------------ + + def _embedding_retrieval( + self, + query_embedding: list[float], + *, + filters: dict[str, Any] | None = None, + top_k: int = 10, + ) -> list[Document]: + """ + Retrieve documents by vector similarity using ArcadeDB's LSM_VECTOR index. + + :param query_embedding: The embedding vector to search with. + :param filters: Optional metadata filters (applied as post-filter). + :param top_k: Maximum number of documents to return. + :returns: Documents ordered by descending similarity score. 
+ """ + self._ensure_initialized() + embedding_str = str(query_embedding) + + # vectorNeighbors returns a single row with a "neighbors" list of {record, distance} + sql = ( + f"SELECT vectorNeighbors('{self._type_name}[embedding]', " + f"{embedding_str}, {top_k}) AS neighbors" + ) + rows = self._command(sql) + if not rows or not rows[0].get("neighbors"): + return [] + + neighbors = rows[0]["neighbors"] + where = _convert_filters(filters) + + documents = [] + for neighbor in neighbors: + record = neighbor.get("record", {}) + distance = neighbor.get("distance", 0.0) + score = 1.0 - distance + + doc = Document( + id=record.get("id", ""), + content=record.get("content"), + meta=record.get("meta") or {}, + score=score, + ) + documents.append(doc) + + # Post-filter by metadata if specified + if where and filters: + filtered_ids = { + r["id"] for r in self._command(f"SELECT id FROM `{self._type_name}` WHERE {where}") + } + documents = [d for d in documents if d.id in filtered_ids] + + return documents + + +def _sql_str(value: str | None) -> str: + """Escape and quote a string value for ArcadeDB SQL.""" + if value is None: + return "NULL" + escaped = value.replace("\\", "\\\\").replace("'", "\\'") + return f"'{escaped}'" + + +def _map_literal(meta: dict[str, Any]) -> str: + """Build an ArcadeDB MAP literal from a Python dict.""" + if not meta: + return "{}" + pairs = [] + for key, value in meta.items(): + if isinstance(value, str): + pairs.append(f'"{key}": {_sql_str(value)}') + elif isinstance(value, bool): + pairs.append(f'"{key}": {"true" if value else "false"}') + elif isinstance(value, (int, float)): + pairs.append(f'"{key}": {value}') + elif value is None: + pairs.append(f'"{key}": NULL') + elif isinstance(value, list): + pairs.append(f'"{key}": {value}') + else: + pairs.append(f'"{key}": {_sql_str(str(value))}') + return "{" + ", ".join(pairs) + "}" diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/filters.py 
b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/filters.py new file mode 100644 index 0000000000..52a7285989 --- /dev/null +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/filters.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# +# SPDX-License-Identifier: Apache-2.0 + +"""Convert Haystack filter dictionaries to ArcadeDB SQL WHERE clauses.""" + +from typing import Any + + +def _convert_filters(filters: dict[str, Any] | None) -> str: + """ + Convert a Haystack filter dictionary to an ArcadeDB SQL WHERE clause. + + Supports comparison operators (==, !=, >, >=, <, <=, in, not in) + and logical operators (AND, OR, NOT). + """ + if not filters: + return "" + return _parse_condition(filters) + + +def _parse_condition(condition: dict[str, Any]) -> str: + operator = condition.get("operator") + if not operator: + msg = f"Missing 'operator' in filter condition: {condition}" + raise ValueError(msg) + + operator_upper = operator.upper() + + if operator_upper in ("AND", "OR"): + conditions = condition.get("conditions", []) + if not conditions: + return "" + parts = [_parse_condition(c) for c in conditions] + parts = [p for p in parts if p] + if not parts: + return "" + if len(parts) == 1: + return parts[0] + joiner = f" {operator_upper} " + return f"({joiner.join(parts)})" + + if operator_upper == "NOT": + conditions = condition.get("conditions", []) + if not conditions: + return "" + inner = _parse_condition(conditions[0]) + return f"NOT ({inner})" if inner else "" + + field = condition.get("field") + value = condition.get("value") + + if not field: + msg = f"Missing 'field' in filter condition: {condition}" + raise ValueError(msg) + + return _comparison_to_sql(field, operator, value) + + +def _comparison_to_sql(field: str, operator: str, value: Any) -> str: + if operator == "==": + if value is None: + return f"{field} IS NULL" + return f"{field} = {_sql_value(value)}" + + if operator == 
"!=": + if value is None: + return f"{field} IS NOT NULL" + return f"{field} <> {_sql_value(value)}" + + if operator == ">": + return f"{field} > {_sql_value(value)}" + + if operator == ">=": + return f"{field} >= {_sql_value(value)}" + + if operator == "<": + return f"{field} < {_sql_value(value)}" + + if operator == "<=": + return f"{field} <= {_sql_value(value)}" + + if operator == "in": + values = ", ".join(_sql_value(v) for v in value) + return f"{field} IN [{values}]" + + if operator == "not in": + values = ", ".join(_sql_value(v) for v in value) + return f"{field} NOT IN [{values}]" + + msg = f"Unsupported filter operator: {operator}" + raise ValueError(msg) + + +def _sql_value(value: Any) -> str: + """Format a Python value as an ArcadeDB SQL literal.""" + if isinstance(value, str): + escaped = value.replace("'", "\\'") + return f"'{escaped}'" + if isinstance(value, bool): + return "true" if value else "false" + if isinstance(value, (int, float)): + return str(value) + if value is None: + return "NULL" + return f"'{value}'" diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/py.typed b/integrations/arcadedb/src/haystack_integrations/document_stores/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/arcadedb/tests/__init__.py b/integrations/arcadedb/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/arcadedb/tests/test_document_store.py b/integrations/arcadedb/tests/test_document_store.py new file mode 100644 index 0000000000..85e737dd4f --- /dev/null +++ b/integrations/arcadedb/tests/test_document_store.py @@ -0,0 +1,160 @@ +# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# +# SPDX-License-Identifier: Apache-2.0 + +import dataclasses +import os + +import pytest +from haystack import Document +from haystack.document_stores.errors import DuplicateDocumentError +from haystack.document_stores.types import DuplicatePolicy + +from 
haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore + +ARCADEDB_URL = os.getenv("ARCADEDB_URL", "http://localhost:2480") + + +@pytest.fixture() +def document_store(): + store = ArcadeDBDocumentStore( + url=ARCADEDB_URL, + database="haystack_test", + embedding_dimension=4, + recreate_type=True, + ) + return store + + +def _sample_docs(n: int = 3, dim: int = 4) -> list[Document]: + docs = [] + for i in range(n): + docs.append( + Document( + content=f"Document number {i}", + embedding=[float(i)] * dim, + meta={"category": "test", "priority": i}, + ) + ) + return docs + + +# --------------------------------------------------------------------------- +# Unit tests (no ArcadeDB required) +# --------------------------------------------------------------------------- + + +class TestSerialization: + def test_to_dict_from_dict(self): + store = ArcadeDBDocumentStore( + url="http://localhost:2480", + database="test_db", + embedding_dimension=4, + ) + data = store.to_dict() + restored = ArcadeDBDocumentStore.from_dict(data) + assert restored._database == store._database + assert restored._embedding_dimension == store._embedding_dimension + assert restored._url == store._url + + +# --------------------------------------------------------------------------- +# Integration tests (require a running ArcadeDB instance) +# --------------------------------------------------------------------------- + + +@pytest.mark.integration +class TestArcadeDBDocumentStoreIntegration: + def test_count_empty(self, document_store): + assert document_store.count_documents() == 0 + + def test_count_after_write(self, document_store): + docs = _sample_docs(5) + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + assert document_store.count_documents() == 5 + + def test_write_and_read(self, document_store): + docs = _sample_docs(2) + written = document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + assert written == 2 + + all_docs = 
document_store.filter_documents() + assert len(all_docs) == 2 + + def test_write_overwrite(self, document_store): + docs = _sample_docs(1) + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + updated = dataclasses.replace(docs[0], content="Updated content") + document_store.write_documents([updated], policy=DuplicatePolicy.OVERWRITE) + + all_docs = document_store.filter_documents() + assert len(all_docs) == 1 + assert all_docs[0].content == "Updated content" + + def test_write_skip(self, document_store): + docs = _sample_docs(1) + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + written = document_store.write_documents(docs, policy=DuplicatePolicy.SKIP) + assert written == 0 + assert document_store.count_documents() == 1 + + def test_write_duplicate_raises(self, document_store): + docs = _sample_docs(1) + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + with pytest.raises(DuplicateDocumentError): + document_store.write_documents(docs, policy=DuplicatePolicy.NONE) + + def test_delete(self, document_store): + docs = _sample_docs(3) + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + ids_to_delete = [docs[0].id, docs[1].id] + document_store.delete_documents(ids_to_delete) + + assert document_store.count_documents() == 1 + + def test_filter_equality(self, document_store): + docs = _sample_docs(3) + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + result = document_store.filter_documents( + filters={"field": "meta.category", "operator": "==", "value": "test"} + ) + assert len(result) == 3 + + def test_filter_comparison(self, document_store): + docs = _sample_docs(5) + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + result = document_store.filter_documents( + filters={"field": "meta.priority", "operator": ">", "value": 2} + ) + assert len(result) == 2 # priority 3 and 4 + + def test_filter_and(self, document_store): + 
docs = _sample_docs(5) + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + result = document_store.filter_documents( + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.category", "operator": "==", "value": "test"}, + {"field": "meta.priority", "operator": ">=", "value": 3}, + ], + } + ) + assert len(result) == 2 + + def test_embedding_retrieval(self, document_store): + docs = _sample_docs(5, dim=4) + document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) + + results = document_store._embedding_retrieval( + query_embedding=[4.0, 4.0, 4.0, 4.0], top_k=3 + ) + assert len(results) <= 3 + assert results[0].score is not None diff --git a/integrations/arcadedb/tests/test_filters.py b/integrations/arcadedb/tests/test_filters.py new file mode 100644 index 0000000000..e68e1b135b --- /dev/null +++ b/integrations/arcadedb/tests/test_filters.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# +# SPDX-License-Identifier: Apache-2.0 + +"""Unit tests for filter conversion (no ArcadeDB instance required).""" + +import pytest + +from haystack_integrations.document_stores.arcadedb.filters import _convert_filters + + +class TestFilterConversion: + def test_none_returns_empty(self): + assert _convert_filters(None) == "" + + def test_equality(self): + result = _convert_filters({"field": "meta.name", "operator": "==", "value": "alice"}) + assert result == "meta.name = 'alice'" + + def test_equality_null(self): + result = _convert_filters({"field": "meta.name", "operator": "==", "value": None}) + assert result == "meta.name IS NULL" + + def test_not_equal(self): + result = _convert_filters({"field": "meta.name", "operator": "!=", "value": "bob"}) + assert result == "meta.name <> 'bob'" + + def test_not_equal_null(self): + result = _convert_filters({"field": "meta.name", "operator": "!=", "value": None}) + assert result == "meta.name IS NOT NULL" + + def test_greater_than(self): + result = 
_convert_filters({"field": "meta.score", "operator": ">", "value": 5}) + assert result == "meta.score > 5" + + def test_in_operator(self): + result = _convert_filters({"field": "meta.tag", "operator": "in", "value": ["a", "b"]}) + assert result == "meta.tag IN ['a', 'b']" + + def test_not_in_operator(self): + result = _convert_filters({"field": "meta.tag", "operator": "not in", "value": ["x"]}) + assert result == "meta.tag NOT IN ['x']" + + def test_and(self): + result = _convert_filters({ + "operator": "AND", + "conditions": [ + {"field": "meta.a", "operator": "==", "value": 1}, + {"field": "meta.b", "operator": ">", "value": 2}, + ], + }) + assert result == "(meta.a = 1 AND meta.b > 2)" + + def test_or(self): + result = _convert_filters({ + "operator": "OR", + "conditions": [ + {"field": "meta.x", "operator": "==", "value": "yes"}, + {"field": "meta.y", "operator": "==", "value": "no"}, + ], + }) + assert result == "(meta.x = 'yes' OR meta.y = 'no')" + + def test_not(self): + result = _convert_filters({ + "operator": "NOT", + "conditions": [ + {"field": "meta.deleted", "operator": "==", "value": True}, + ], + }) + assert result == "NOT (meta.deleted = true)" + + def test_nested(self): + result = _convert_filters({ + "operator": "AND", + "conditions": [ + {"field": "meta.a", "operator": "==", "value": 1}, + { + "operator": "OR", + "conditions": [ + {"field": "meta.b", "operator": "==", "value": 2}, + {"field": "meta.c", "operator": "==", "value": 3}, + ], + }, + ], + }) + assert result == "(meta.a = 1 AND (meta.b = 2 OR meta.c = 3))" + + def test_missing_operator_raises(self): + with pytest.raises(ValueError): + _convert_filters({"field": "x", "value": 1}) + + def test_missing_field_raises(self): + with pytest.raises(ValueError): + _convert_filters({"operator": "==", "value": 1}) From 951a1b52352ab99943fe74a342a2846edd188e88 Mon Sep 17 00:00:00 2001 From: lvca Date: Sat, 28 Feb 2026 18:33:43 -0500 Subject: [PATCH 02/16] fix: resolve ruff lint errors and set 
requests minimum version - Remove unused noqa: B008 directives (B008 already in ignore list) - Use HTTPStatus.BAD_REQUEST instead of magic value 400 (PLR2004) - Add S608 to ruff ignore (SQL string construction is intentional for ArcadeDB HTTP/JSON API with proper value escaping) - Set requests>=2.28.0 minimum to ensure Python 3.13 compatibility (older versions use removed cgi module) --- integrations/arcadedb/pyproject.toml | 4 +++- .../document_stores/arcadedb/document_store.py | 9 +++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/integrations/arcadedb/pyproject.toml b/integrations/arcadedb/pyproject.toml index c546f3dd87..8538deba04 100644 --- a/integrations/arcadedb/pyproject.toml +++ b/integrations/arcadedb/pyproject.toml @@ -24,7 +24,7 @@ classifiers = [ ] dependencies = [ "haystack-ai>=2.9.0", - "requests", + "requests>=2.28.0", ] [project.urls] @@ -91,6 +91,8 @@ ignore = [ "B027", "FBT003", "S105", "S106", "S107", "C901", "PLR0911", "PLR0912", "PLR0913", "PLR0915", "B008", "S101", + # SQL string construction is intentional — ArcadeDB uses HTTP/JSON API with value escaping + "S608", ] [tool.ruff.lint.isort] diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py index 1b2a1e3596..584339c26e 100644 --- a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py @@ -5,6 +5,7 @@ """ArcadeDB DocumentStore for Haystack 2.x — document storage + vector search via HTTP/JSON API.""" import logging +from http import HTTPStatus from typing import Any, ClassVar import requests @@ -54,8 +55,8 @@ def __init__( *, url: str = "http://localhost:2480", database: str = "haystack", - username: Secret = Secret.from_env_var("ARCADEDB_USERNAME", strict=False), # noqa: B008 - password: Secret = 
Secret.from_env_var("ARCADEDB_PASSWORD", strict=False), # noqa: B008 + username: Secret = Secret.from_env_var("ARCADEDB_USERNAME", strict=False), + password: Secret = Secret.from_env_var("ARCADEDB_PASSWORD", strict=False), type_name: str = "Document", embedding_dimension: int = 768, similarity_function: str = "cosine", @@ -135,7 +136,7 @@ def _command(self, sql: str, *, positional_params: list[Any] | None = None) -> l payload["params"] = positional_params resp = self._session.post(url, json=payload, auth=self._auth()) - if resp.status_code >= 400: + if resp.status_code >= HTTPStatus.BAD_REQUEST: msg = f"ArcadeDB command failed ({resp.status_code}): {resp.text}" raise RuntimeError(msg) @@ -146,7 +147,7 @@ def _server_command(self, command: str) -> dict[str, Any]: """Execute a server-level command (e.g. CREATE DATABASE).""" url = f"{self._url}/api/v1/server" resp = self._session.post(url, json={"command": command}, auth=self._auth()) - if resp.status_code >= 400: + if resp.status_code >= HTTPStatus.BAD_REQUEST: msg = f"ArcadeDB server command failed ({resp.status_code}): {resp.text}" raise RuntimeError(msg) return resp.json() From e031eb4b9d6461b1fe4ca24b6a093965b59e1036 Mon Sep 17 00:00:00 2001 From: lvca Date: Sat, 28 Feb 2026 18:36:55 -0500 Subject: [PATCH 03/16] style: apply ruff format to all source files --- .../arcadedb/examples/embedding_retrieval.py | 12 +-- .../arcadedb/document_store.py | 17 +---- .../arcadedb/tests/test_document_store.py | 12 +-- integrations/arcadedb/tests/test_filters.py | 74 ++++++++++--------- 4 files changed, 55 insertions(+), 60 deletions(-) diff --git a/integrations/arcadedb/examples/embedding_retrieval.py b/integrations/arcadedb/examples/embedding_retrieval.py index ffdbad3acd..26033a7609 100644 --- a/integrations/arcadedb/examples/embedding_retrieval.py +++ b/integrations/arcadedb/examples/embedding_retrieval.py @@ -75,12 +75,14 @@ print(f" score={doc.score:.4f} category={doc.meta.get('category')} content={doc.content[:80]}...") 
# --- 5. Filter retrieval (only 'database' category) --- -result_filtered = pipeline.run({ - "retriever": { - "query_embedding": query_embedding, - "filters": {"field": "meta.category", "operator": "==", "value": "database"}, +result_filtered = pipeline.run( + { + "retriever": { + "query_embedding": query_embedding, + "filters": {"field": "meta.category", "operator": "==", "value": "database"}, + } } -}) +) print("\n--- Filtered (category=database) ---") for doc in result_filtered["retriever"]["documents"]: diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py index 584339c26e..7cb71acff3 100644 --- a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py @@ -271,9 +271,7 @@ def write_documents( written += 1 elif policy == DuplicatePolicy.SKIP: - existing = self._command( - f"SELECT id FROM `{self._type_name}` WHERE id = {_sql_str(record['id'])}" - ) + existing = self._command(f"SELECT id FROM `{self._type_name}` WHERE id = {_sql_str(record['id'])}") if existing: continue self._insert_record(record, embedding_str, meta_str) @@ -281,9 +279,7 @@ def write_documents( else: # DuplicatePolicy.NONE — raise on duplicate - existing = self._command( - f"SELECT id FROM `{self._type_name}` WHERE id = {_sql_str(record['id'])}" - ) + existing = self._command(f"SELECT id FROM `{self._type_name}` WHERE id = {_sql_str(record['id'])}") if existing: msg = f"Document with id '{record['id']}' already exists." 
raise DuplicateDocumentError(msg) @@ -337,10 +333,7 @@ def _embedding_retrieval( embedding_str = str(query_embedding) # vectorNeighbors returns a single row with a "neighbors" list of {record, distance} - sql = ( - f"SELECT vectorNeighbors('{self._type_name}[embedding]', " - f"{embedding_str}, {top_k}) AS neighbors" - ) + sql = f"SELECT vectorNeighbors('{self._type_name}[embedding]', {embedding_str}, {top_k}) AS neighbors" rows = self._command(sql) if not rows or not rows[0].get("neighbors"): return [] @@ -364,9 +357,7 @@ def _embedding_retrieval( # Post-filter by metadata if specified if where and filters: - filtered_ids = { - r["id"] for r in self._command(f"SELECT id FROM `{self._type_name}` WHERE {where}") - } + filtered_ids = {r["id"] for r in self._command(f"SELECT id FROM `{self._type_name}` WHERE {where}")} documents = [d for d in documents if d.id in filtered_ids] return documents diff --git a/integrations/arcadedb/tests/test_document_store.py b/integrations/arcadedb/tests/test_document_store.py index 85e737dd4f..a8bdbde32d 100644 --- a/integrations/arcadedb/tests/test_document_store.py +++ b/integrations/arcadedb/tests/test_document_store.py @@ -120,18 +120,14 @@ def test_filter_equality(self, document_store): docs = _sample_docs(3) document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - result = document_store.filter_documents( - filters={"field": "meta.category", "operator": "==", "value": "test"} - ) + result = document_store.filter_documents(filters={"field": "meta.category", "operator": "==", "value": "test"}) assert len(result) == 3 def test_filter_comparison(self, document_store): docs = _sample_docs(5) document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - result = document_store.filter_documents( - filters={"field": "meta.priority", "operator": ">", "value": 2} - ) + result = document_store.filter_documents(filters={"field": "meta.priority", "operator": ">", "value": 2}) assert len(result) == 2 # priority 3 and 
4 def test_filter_and(self, document_store): @@ -153,8 +149,6 @@ def test_embedding_retrieval(self, document_store): docs = _sample_docs(5, dim=4) document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - results = document_store._embedding_retrieval( - query_embedding=[4.0, 4.0, 4.0, 4.0], top_k=3 - ) + results = document_store._embedding_retrieval(query_embedding=[4.0, 4.0, 4.0, 4.0], top_k=3) assert len(results) <= 3 assert results[0].score is not None diff --git a/integrations/arcadedb/tests/test_filters.py b/integrations/arcadedb/tests/test_filters.py index e68e1b135b..7beb75006f 100644 --- a/integrations/arcadedb/tests/test_filters.py +++ b/integrations/arcadedb/tests/test_filters.py @@ -42,48 +42,56 @@ def test_not_in_operator(self): assert result == "meta.tag NOT IN ['x']" def test_and(self): - result = _convert_filters({ - "operator": "AND", - "conditions": [ - {"field": "meta.a", "operator": "==", "value": 1}, - {"field": "meta.b", "operator": ">", "value": 2}, - ], - }) + result = _convert_filters( + { + "operator": "AND", + "conditions": [ + {"field": "meta.a", "operator": "==", "value": 1}, + {"field": "meta.b", "operator": ">", "value": 2}, + ], + } + ) assert result == "(meta.a = 1 AND meta.b > 2)" def test_or(self): - result = _convert_filters({ - "operator": "OR", - "conditions": [ - {"field": "meta.x", "operator": "==", "value": "yes"}, - {"field": "meta.y", "operator": "==", "value": "no"}, - ], - }) + result = _convert_filters( + { + "operator": "OR", + "conditions": [ + {"field": "meta.x", "operator": "==", "value": "yes"}, + {"field": "meta.y", "operator": "==", "value": "no"}, + ], + } + ) assert result == "(meta.x = 'yes' OR meta.y = 'no')" def test_not(self): - result = _convert_filters({ - "operator": "NOT", - "conditions": [ - {"field": "meta.deleted", "operator": "==", "value": True}, - ], - }) + result = _convert_filters( + { + "operator": "NOT", + "conditions": [ + {"field": "meta.deleted", "operator": "==", "value": 
True}, + ], + } + ) assert result == "NOT (meta.deleted = true)" def test_nested(self): - result = _convert_filters({ - "operator": "AND", - "conditions": [ - {"field": "meta.a", "operator": "==", "value": 1}, - { - "operator": "OR", - "conditions": [ - {"field": "meta.b", "operator": "==", "value": 2}, - {"field": "meta.c", "operator": "==", "value": 3}, - ], - }, - ], - }) + result = _convert_filters( + { + "operator": "AND", + "conditions": [ + {"field": "meta.a", "operator": "==", "value": 1}, + { + "operator": "OR", + "conditions": [ + {"field": "meta.b", "operator": "==", "value": 2}, + {"field": "meta.c", "operator": "==", "value": 3}, + ], + }, + ], + } + ) assert result == "(meta.a = 1 AND (meta.b = 2 OR meta.c = 3))" def test_missing_operator_raises(self): From 2f1f2a6bae49505c37cc6fdbb8b7bd550bc03606 Mon Sep 17 00:00:00 2001 From: lvca Date: Sat, 28 Feb 2026 18:41:06 -0500 Subject: [PATCH 04/16] fix: add type annotation to resolve mypy assignment error --- .../components/retrievers/arcadedb/embedding_retriever.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py index 1624055eb9..58555ac471 100644 --- a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py +++ b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py @@ -66,6 +66,7 @@ def run( """ effective_top_k = top_k if top_k is not None else self._top_k + effective_filters: dict[str, Any] | None if self._filter_policy == FilterPolicy.REPLACE and filters is not None: effective_filters = filters elif self._filter_policy == FilterPolicy.MERGE and filters is not None and self._filters is not None: From cd3c8a47310838ac17d1430cbe430bcb2722c9f6 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 2 Mar 2026 
13:14:59 +0100 Subject: [PATCH 05/16] Apply suggestions from code review --- .github/labeler.yml | 5 ++ integrations/arcadedb/README.md | 80 ++----------------- integrations/arcadedb/pyproject.toml | 2 +- .../arcadedb/embedding_retriever.py | 37 ++++++++- .../arcadedb/document_store.py | 26 +++++- 5 files changed, 67 insertions(+), 83 deletions(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index a4add7d750..e0b9185e8c 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -24,6 +24,11 @@ integration:anthropic: - any-glob-to-any-file: "integrations/anthropic/**/*" - any-glob-to-any-file: ".github/workflows/anthropic.yml" +integration:arcadedb: + - changed-files: + - any-glob-to-any-file: "integrations/arcadedb/**/*" + - any-glob-to-any-file: ".github/workflows/arcadedb.yml" + integration:astra: - changed-files: - any-glob-to-any-file: "integrations/astra/**/*" diff --git a/integrations/arcadedb/README.md b/integrations/arcadedb/README.md index 0ee93f0722..eb0ac897ca 100644 --- a/integrations/arcadedb/README.md +++ b/integrations/arcadedb/README.md @@ -4,82 +4,12 @@ [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/arcadedb-haystack.svg)](https://pypi.org/project/arcadedb-haystack) [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE.txt) -**[ArcadeDB](https://arcadedb.com)** integration for [Haystack](https://haystack.deepset.ai/) 2.x. +- [Integration page](https://haystack.deepset.ai/integrations/arcadedb) +- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/arcadedb/CHANGELOG.md) -ArcadeDB is an open-source multi-model database that combines document storage, HNSW vector search, and SQL metadata filtering in a single engine. This integration provides a `DocumentStore` and `EmbeddingRetriever` that connect to ArcadeDB via its HTTP/JSON API using only the `requests` library -- no special drivers needed. 
+--- -## Installation +## Contributing -```bash -pip install arcadedb-haystack -``` +Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). -## Usage - -Start ArcadeDB: - -```bash -docker run -d -p 2480:2480 \ - -e JAVA_OPTS="-Darcadedb.server.rootPassword=arcadedb" \ - arcadedata/arcadedb:latest - -export ARCADEDB_USERNAME=root -export ARCADEDB_PASSWORD=arcadedb -``` - -### Document Store - -```python -from haystack import Document -from haystack.document_stores.types import DuplicatePolicy -from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore - -store = ArcadeDBDocumentStore( - database="myproject", - embedding_dimension=768, -) - -docs = [ - Document( - content="ArcadeDB supports graphs, documents, and vectors.", - embedding=[0.1] * 768, - meta={"source": "docs", "category": "database"}, - ) -] -store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) -store.filter_documents( - filters={"field": "meta.category", "operator": "==", "value": "database"} -) -``` - -### Pipeline with Embedding Retriever - -```python -from haystack import Pipeline -from haystack_integrations.components.retrievers.arcadedb import ArcadeDBEmbeddingRetriever -from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore - -store = ArcadeDBDocumentStore(database="myproject", embedding_dimension=768) -pipeline = Pipeline() -pipeline.add_component("retriever", ArcadeDBEmbeddingRetriever(document_store=store, top_k=10)) - -result = pipeline.run({"retriever": {"query_embedding": [0.1] * 768}}) -``` - -## Configuration - -| Parameter | Default | Description | -|---|---|---| -| `url` | `http://localhost:2480` | ArcadeDB HTTP endpoint | -| `database` | `haystack` | Database name | -| `username` | env `ARCADEDB_USERNAME` | HTTP Basic Auth username | -| `password` | env `ARCADEDB_PASSWORD` | HTTP Basic Auth password | -| `type_name` | `Document` | Vertex type name | -| 
`embedding_dimension` | `768` | Vector dimension for HNSW index | -| `similarity_function` | `cosine` | `cosine`, `euclidean`, or `dot` | -| `recreate_type` | `False` | Drop and recreate type on init | -| `create_database` | `True` | Create database if it doesn't exist | - -## License - -`arcadedb-haystack` is distributed under the terms of the [Apache-2.0](LICENSE.txt) license. diff --git a/integrations/arcadedb/pyproject.toml b/integrations/arcadedb/pyproject.toml index 8538deba04..d79b975003 100644 --- a/integrations/arcadedb/pyproject.toml +++ b/integrations/arcadedb/pyproject.toml @@ -10,7 +10,7 @@ readme = "README.md" requires-python = ">=3.10" license = "Apache-2.0" keywords = ["arcadedb", "haystack", "vector-search", "document-store", "rag"] -authors = [{ name = "ArcadeData Ltd", email = "info@arcadedb.com" }] +authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }, { name = "ArcadeData Ltd", email = "info@arcadedb.com" }] classifiers = [ "License :: OSI Approved :: Apache Software License", "Development Status :: 4 - Beta", diff --git a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py index 58555ac471..4181205a18 100644 --- a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py +++ b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py @@ -20,11 +20,29 @@ class ArcadeDBEmbeddingRetriever: Usage example: ```python + from haystack import Document + from haystack.components.embedders import SentenceTransformersTextEmbedder from haystack_integrations.components.retrievers.arcadedb import ArcadeDBEmbeddingRetriever from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore store = ArcadeDBDocumentStore(database="mydb") retriever = ArcadeDBEmbeddingRetriever(document_store=store, 
top_k=5) + + # Add documents to DocumentStore + documents = [ + Document(text="My name is Carla and I live in Berlin"), + Document(text="My name is Paul and I live in New York"), + Document(text="My name is Silvano and I live in Matera"), + Document(text="My name is Usagi Tsukino and I live in Tokyo"), + ] + document_store.write_documents(documents) + + embedder = SentenceTransformersTextEmbedder() + query_embeddings = embedder.run("Who lives in Berlin?")["embedding"] + + result = retriever.run(query=query_embeddings) + for doc in result["documents"]: + print(doc.content) ``` """ @@ -62,7 +80,8 @@ def run( :param query_embedding: The embedding vector to search with. :param filters: Optional filters to narrow results. :param top_k: Maximum number of documents to return. - :returns: A dict with key ``"documents"`` containing the retrieved documents. + :returns: A dictionary with the following keys: + - `documents`: List of `Document`s most similar to the given `query_embedding` """ effective_top_k = top_k if top_k is not None else self._top_k @@ -85,7 +104,12 @@ def run( return {"documents": documents} def to_dict(self) -> dict[str, Any]: - """Serialize this retriever to a dictionary.""" + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ return default_to_dict( self, document_store=self._document_store.to_dict(), @@ -96,7 +120,14 @@ def to_dict(self) -> dict[str, Any]: @classmethod def from_dict(cls, data: dict[str, Any]) -> "ArcadeDBEmbeddingRetriever": - """Deserialize an ArcadeDBEmbeddingRetriever from a dictionary.""" + """ + Deserializes the component from a dictionary. + + :param data: + Dictionary to deserialize from. + :returns: + Deserialized component. 
+ """ init_params = data.get("init_parameters", {}) if "document_store" in init_params: init_params["document_store"] = ArcadeDBDocumentStore.from_dict(init_params["document_store"]) diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py index 7cb71acff3..aade562b19 100644 --- a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py @@ -33,13 +33,18 @@ class ArcadeDBDocumentStore: Usage example: ```python + from haystack.dataclasses.document import Document from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore - store = ArcadeDBDocumentStore( + document_store = ArcadeDBDocumentStore( url="http://localhost:2480", database="haystack", embedding_dimension=768, ) + document_store.write_documents([ + Document(content="This is first", embedding=[0.0]*5), + Document(content="This is second", embedding=[0.1, 0.2, 0.3, 0.4, 0.5]) + ]) ``` """ @@ -94,7 +99,11 @@ def __init__( # ------------------------------------------------------------------ def to_dict(self) -> dict[str, Any]: - """Serialize this store to a dictionary.""" + """ + Serializes the DocumentStore to a dictionary. + + :returns: + Dictionary with serialized data. return default_to_dict( self, url=self._url, @@ -110,7 +119,13 @@ def to_dict(self) -> dict[str, Any]: @classmethod def from_dict(cls, data: dict[str, Any]) -> "ArcadeDBDocumentStore": - """Deserialize an ArcadeDBDocumentStore from a dictionary.""" + """ + Deserializes the DocumentStore from a dictionary. + + :param data: + The dictionary to deserialize from. + :returns: + The deserialized DocumentStore. 
init_params = data.get("init_parameters", {}) for key in ("username", "password"): if init_params.get(key) is not None: @@ -208,7 +223,10 @@ def _ensure_initialized(self) -> None: # ------------------------------------------------------------------ def count_documents(self) -> int: - """Return the number of documents stored.""" + Returns how many documents are present in the document store. + + :returns: + Number of documents in the document store. self._ensure_initialized() rows = self._command(f"SELECT count(*) AS cnt FROM `{self._type_name}`") if rows: From 9922db947c82c9b6793d765f0ebf26e8ec3b2f79 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 2 Mar 2026 13:21:04 +0100 Subject: [PATCH 06/16] format and fix docstrings --- .../components/retrievers/arcadedb/embedding_retriever.py | 4 ++-- .../document_stores/arcadedb/document_store.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py index 4181205a18..8fc7bee800 100644 --- a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py +++ b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py @@ -27,7 +27,7 @@ class ArcadeDBEmbeddingRetriever: store = ArcadeDBDocumentStore(database="mydb") retriever = ArcadeDBEmbeddingRetriever(document_store=store, top_k=5) - + # Add documents to DocumentStore documents = [ Document(text="My name is Carla and I live in Berlin"), @@ -36,7 +36,7 @@ class ArcadeDBEmbeddingRetriever: Document(text="My name is Usagi Tsukino and I live in Tokyo"), ] document_store.write_documents(documents) - + embedder = SentenceTransformersTextEmbedder() query_embeddings = embedder.run("Who lives in Berlin?")["embedding"] diff --git 
a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py index aade562b19..27a2d831e4 100644 --- a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py @@ -104,6 +104,7 @@ def to_dict(self) -> dict[str, Any]: :returns: Dictionary with serialized data. + """ return default_to_dict( self, url=self._url, @@ -126,6 +127,7 @@ def from_dict(cls, data: dict[str, Any]) -> "ArcadeDBDocumentStore": The dictionary to deserialize from. :returns: The deserialized DocumentStore. + """ init_params = data.get("init_parameters", {}) for key in ("username", "password"): if init_params.get(key) is not None: @@ -223,10 +225,12 @@ def _ensure_initialized(self) -> None: # ------------------------------------------------------------------ def count_documents(self) -> int: + """ Returns how many documents are present in the document store. :returns: Number of documents in the document store. 
+ """ self._ensure_initialized() rows = self._command(f"SELECT count(*) AS cnt FROM `{self._type_name}`") if rows: From 6827d0f65c239c686bfdcf04e962fe4b5db98615 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 2 Mar 2026 13:24:55 +0100 Subject: [PATCH 07/16] Update integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py --- .../components/retrievers/arcadedb/embedding_retriever.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py index 8fc7bee800..ab829a5d21 100644 --- a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py +++ b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py @@ -2,8 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -"""ArcadeDB Embedding Retriever for Haystack 2.x pipelines.""" - from typing import Any from haystack import Document, component, default_from_dict, default_to_dict From fc8a7a80ad91de91501c93d76dfb93dff6378b1c Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 2 Mar 2026 13:25:08 +0100 Subject: [PATCH 08/16] Update .github/labeler.yml --- .github/labeler.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/labeler.yml b/.github/labeler.yml index e0b9185e8c..20cb6edf9f 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -4,11 +4,6 @@ integration:aimlapi: - any-glob-to-any-file: "integrations/aimlapi/**/*" - any-glob-to-any-file: ".github/workflows/aimlapi.yml" -integration:arcadedb: - - changed-files: - - any-glob-to-any-file: "integrations/arcadedb/**/*" - - any-glob-to-any-file: ".github/workflows/arcadedb.yml" - integration:amazon-bedrock: - changed-files: - any-glob-to-any-file: "integrations/amazon_bedrock/**/*" From 
031932f699b0d09331cb9295d67e7775ed9af6e6 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 2 Mar 2026 13:25:16 +0100 Subject: [PATCH 09/16] Update integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py --- .../document_stores/arcadedb/document_store.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py index 27a2d831e4..41f6849c13 100644 --- a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py @@ -94,10 +94,6 @@ def __init__( self._session = requests.Session() self._initialized = False - # ------------------------------------------------------------------ - # Serialization (Haystack pipeline export/import) - # ------------------------------------------------------------------ - def to_dict(self) -> dict[str, Any]: """ Serializes the DocumentStore to a dictionary. 
From 5b29fe107f00c96baf782db17f9529e7ec877988 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 2 Mar 2026 13:38:14 +0100 Subject: [PATCH 10/16] update license for consistency --- integrations/arcadedb/LICENSE.txt | 2 +- integrations/arcadedb/examples/embedding_retrieval.py | 2 +- .../components/retrievers/arcadedb/__init__.py | 2 +- .../components/retrievers/arcadedb/embedding_retriever.py | 2 +- .../haystack_integrations/document_stores/arcadedb/__init__.py | 2 +- .../document_stores/arcadedb/converters.py | 2 +- .../document_stores/arcadedb/document_store.py | 2 +- .../haystack_integrations/document_stores/arcadedb/filters.py | 2 +- integrations/arcadedb/tests/test_document_store.py | 2 +- integrations/arcadedb/tests/test_filters.py | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/integrations/arcadedb/LICENSE.txt b/integrations/arcadedb/LICENSE.txt index 0fa7906e3a..1c8582b372 100644 --- a/integrations/arcadedb/LICENSE.txt +++ b/integrations/arcadedb/LICENSE.txt @@ -175,7 +175,7 @@ END OF TERMS AND CONDITIONS - Copyright 2025 ArcadeData Ltd + Copyright 2023-present deepset GmbH Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
diff --git a/integrations/arcadedb/examples/embedding_retrieval.py b/integrations/arcadedb/examples/embedding_retrieval.py index 26033a7609..cf539f65b9 100644 --- a/integrations/arcadedb/examples/embedding_retrieval.py +++ b/integrations/arcadedb/examples/embedding_retrieval.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# SPDX-FileCopyrightText: 2025-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/__init__.py b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/__init__.py index eb4a7bfbe5..da774b7d53 100644 --- a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/__init__.py +++ b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# SPDX-FileCopyrightText: 2025-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py index ab829a5d21..ba36a58e4c 100644 --- a/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py +++ b/integrations/arcadedb/src/haystack_integrations/components/retrievers/arcadedb/embedding_retriever.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# SPDX-FileCopyrightText: 2025-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/__init__.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/__init__.py index 3676df644a..d72ad983c9 100644 --- 
a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/__init__.py +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/__init__.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# SPDX-FileCopyrightText: 2025-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/converters.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/converters.py index ccafefae70..61be17a587 100644 --- a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/converters.py +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/converters.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# SPDX-FileCopyrightText: 2025-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py index 41f6849c13..c2a3b4e28c 100644 --- a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# SPDX-FileCopyrightText: 2025-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/filters.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/filters.py index 52a7285989..850e444541 100644 --- a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/filters.py +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/filters.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# 
SPDX-FileCopyrightText: 2025-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/arcadedb/tests/test_document_store.py b/integrations/arcadedb/tests/test_document_store.py index a8bdbde32d..e9e7f2d40c 100644 --- a/integrations/arcadedb/tests/test_document_store.py +++ b/integrations/arcadedb/tests/test_document_store.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# SPDX-FileCopyrightText: 2025-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/arcadedb/tests/test_filters.py b/integrations/arcadedb/tests/test_filters.py index 7beb75006f..3423007c08 100644 --- a/integrations/arcadedb/tests/test_filters.py +++ b/integrations/arcadedb/tests/test_filters.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2025-present ArcadeData Ltd +# SPDX-FileCopyrightText: 2025-present deepset GmbH # # SPDX-License-Identifier: Apache-2.0 From 3498a7a39b6a7f75648ff30ea1572c8ecbb67c81 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 2 Mar 2026 13:55:51 +0100 Subject: [PATCH 11/16] use mixin DocumentStore tests and unify error handling --- .../arcadedb/document_store.py | 16 +- .../document_stores/arcadedb/filters.py | 33 ++-- .../arcadedb/tests/test_document_store.py | 141 +++++++----------- 3 files changed, 94 insertions(+), 96 deletions(-) diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py index c2a3b4e28c..2c99528d8b 100644 --- a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py @@ -12,6 +12,7 @@ from haystack import Document, default_from_dict, default_to_dict from haystack.document_stores.errors import DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy +from haystack.errors 
import FilterError from haystack.utils import Secret from haystack_integrations.document_stores.arcadedb.converters import ( @@ -244,7 +245,10 @@ def filter_documents( :returns: List of matching documents. """ self._ensure_initialized() - where = _convert_filters(filters) + try: + where = _convert_filters(filters) + except ValueError as e: + raise FilterError(str(e)) from e sql = f"SELECT * FROM `{self._type_name}`" if where: sql += f" WHERE {where}" @@ -264,6 +268,11 @@ def write_documents( :returns: Number of documents written. """ self._ensure_initialized() + if not isinstance(documents, list): + raise ValueError("documents must be a list of Document objects") + for doc in documents: + if not isinstance(doc, Document): + raise ValueError("documents must be a list of Document objects") if not documents: return 0 @@ -271,7 +280,10 @@ def write_documents( written = 0 for record in records: - embedding_str = str(record["embedding"]) if record["embedding"] else "[]" + emb = record["embedding"] + if emb is None or not isinstance(emb, list) or len(emb) != self._embedding_dimension: + emb = [0.0] * self._embedding_dimension + embedding_str = str(emb) meta_str = _map_literal(record["meta"]) if record["meta"] else "{}" if policy == DuplicatePolicy.OVERWRITE: diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/filters.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/filters.py index 850e444541..d089898a58 100644 --- a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/filters.py +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/filters.py @@ -28,6 +28,9 @@ def _parse_condition(condition: dict[str, Any]) -> str: operator_upper = operator.upper() if operator_upper in ("AND", "OR"): + if "conditions" not in condition: + msg = f"Missing 'conditions' in filter: {condition}" + raise ValueError(msg) conditions = condition.get("conditions", []) if not conditions: 
return "" @@ -53,6 +56,9 @@ def _parse_condition(condition: dict[str, Any]) -> str: if not field: msg = f"Missing 'field' in filter condition: {condition}" raise ValueError(msg) + if "value" not in condition: + msg = f"Missing 'value' in filter condition: {condition}" + raise ValueError(msg) return _comparison_to_sql(field, operator, value) @@ -68,23 +74,28 @@ def _comparison_to_sql(field: str, operator: str, value: Any) -> str: return f"{field} IS NOT NULL" return f"{field} <> {_sql_value(value)}" - if operator == ">": - return f"{field} > {_sql_value(value)}" - - if operator == ">=": - return f"{field} >= {_sql_value(value)}" - - if operator == "<": - return f"{field} < {_sql_value(value)}" - - if operator == "<=": - return f"{field} <= {_sql_value(value)}" + if operator in (">", ">=", "<", "<="): + if value is None: + return "1 = 0" + if isinstance(value, list): + msg = "Comparison operators require numeric or datetime values, not list" + raise ValueError(msg) + if isinstance(value, str) and "T" not in value: + msg = "Comparison operators require numeric or datetime (ISO) values, not plain string" + raise ValueError(msg) + return f"{field} {operator} {_sql_value(value)}" if operator == "in": + if not isinstance(value, list): + msg = "Operator 'in' requires value to be a list" + raise ValueError(msg) values = ", ".join(_sql_value(v) for v in value) return f"{field} IN [{values}]" if operator == "not in": + if not isinstance(value, list): + msg = "Operator 'not in' requires value to be a list" + raise ValueError(msg) values = ", ".join(_sql_value(v) for v in value) return f"{field} NOT IN [{values}]" diff --git a/integrations/arcadedb/tests/test_document_store.py b/integrations/arcadedb/tests/test_document_store.py index e9e7f2d40c..11c3ed3820 100644 --- a/integrations/arcadedb/tests/test_document_store.py +++ b/integrations/arcadedb/tests/test_document_store.py @@ -9,23 +9,13 @@ from haystack import Document from haystack.document_stores.errors import 
DuplicateDocumentError from haystack.document_stores.types import DuplicatePolicy +from haystack.testing.document_store import DocumentStoreBaseTests from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore ARCADEDB_URL = os.getenv("ARCADEDB_URL", "http://localhost:2480") -@pytest.fixture() -def document_store(): - store = ArcadeDBDocumentStore( - url=ARCADEDB_URL, - database="haystack_test", - embedding_dimension=4, - recreate_type=True, - ) - return store - - def _sample_docs(n: int = 3, dim: int = 4) -> list[Document]: docs = [] for i in range(n): @@ -64,24 +54,49 @@ def test_to_dict_from_dict(self): @pytest.mark.integration -class TestArcadeDBDocumentStoreIntegration: - def test_count_empty(self, document_store): - assert document_store.count_documents() == 0 - - def test_count_after_write(self, document_store): - docs = _sample_docs(5) - document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - assert document_store.count_documents() == 5 - - def test_write_and_read(self, document_store): - docs = _sample_docs(2) - written = document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - assert written == 2 - - all_docs = document_store.filter_documents() - assert len(all_docs) == 2 +class TestArcadeDBDocumentStore(DocumentStoreBaseTests): + """ + Run Haystack DocumentStore mixin tests against ArcadeDBDocumentStore. + + Base tests cover: count_documents, delete_documents, filter_documents, write_documents. + ArcadeDB does not implement delete_all_documents, delete_by_filter, or update_by_filter, + so DocumentStoreBaseTests (not Extended) is used. + """ + + @pytest.fixture + def document_store(self, document_store: ArcadeDBDocumentStore) -> ArcadeDBDocumentStore: + """Override to provide ArcadeDB document store from conftest.""" + yield document_store + + def assert_documents_are_equal(self, received: list[Document], expected: list[Document]): + """ + Compare document lists for tests. 
Clear score (filter_documents does not set it; + embedding_retrieval does). Compare embeddings approximately for float round-trip. + Documents written without embeddings get zero-padded in the store; treat as None for comparison. + """ + assert len(received) == len(expected) + received = sorted(received, key=lambda x: x.id) + expected = sorted(expected, key=lambda x: x.id) + for received_doc, expected_doc in zip(received, expected, strict=True): + received_doc.score = None + if expected_doc.embedding is None: + received_doc.embedding = None + elif received_doc.embedding is None: + assert expected_doc.embedding is None + else: + assert received_doc.embedding == pytest.approx(expected_doc.embedding) + received_doc.embedding, expected_doc.embedding = None, None + assert received_doc == expected_doc + + def test_write_documents(self, document_store: ArcadeDBDocumentStore): + """Override mixin: test default write_documents and duplicate fail behaviour.""" + docs = [Document(id="1")] + assert document_store.write_documents(docs) == 1 + with pytest.raises(DuplicateDocumentError): + document_store.write_documents(docs, policy=DuplicatePolicy.FAIL) - def test_write_overwrite(self, document_store): + def test_write_overwrite(self, document_store: ArcadeDBDocumentStore): + """ArcadeDB-specific: overwrite updates content.""" docs = _sample_docs(1) document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) @@ -92,63 +107,23 @@ def test_write_overwrite(self, document_store): assert len(all_docs) == 1 assert all_docs[0].content == "Updated content" - def test_write_skip(self, document_store): - docs = _sample_docs(1) - document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - - written = document_store.write_documents(docs, policy=DuplicatePolicy.SKIP) - assert written == 0 - assert document_store.count_documents() == 1 - - def test_write_duplicate_raises(self, document_store): - docs = _sample_docs(1) - document_store.write_documents(docs, 
policy=DuplicatePolicy.OVERWRITE) - - with pytest.raises(DuplicateDocumentError): - document_store.write_documents(docs, policy=DuplicatePolicy.NONE) - - def test_delete(self, document_store): - docs = _sample_docs(3) - document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - - ids_to_delete = [docs[0].id, docs[1].id] - document_store.delete_documents(ids_to_delete) - - assert document_store.count_documents() == 1 - - def test_filter_equality(self, document_store): - docs = _sample_docs(3) - document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - - result = document_store.filter_documents(filters={"field": "meta.category", "operator": "==", "value": "test"}) - assert len(result) == 3 - - def test_filter_comparison(self, document_store): - docs = _sample_docs(5) - document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - - result = document_store.filter_documents(filters={"field": "meta.priority", "operator": ">", "value": 2}) - assert len(result) == 2 # priority 3 and 4 - - def test_filter_and(self, document_store): - docs = _sample_docs(5) + def test_embedding_retrieval(self, document_store: ArcadeDBDocumentStore): + """ArcadeDB-specific: vector search via _embedding_retrieval.""" + # Use store's embedding_dimension (768 from conftest); create small test docs + dim = document_store._embedding_dimension + docs = [ + Document( + content=f"Document number {i}", + embedding=[float(i)] * dim, + meta={"category": "test", "priority": i}, + ) + for i in range(5) + ] document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - result = document_store.filter_documents( - filters={ - "operator": "AND", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "test"}, - {"field": "meta.priority", "operator": ">=", "value": 3}, - ], - } + results = document_store._embedding_retrieval( + query_embedding=[4.0] * dim, + top_k=3, ) - assert len(result) == 2 - - def test_embedding_retrieval(self, 
document_store): - docs = _sample_docs(5, dim=4) - document_store.write_documents(docs, policy=DuplicatePolicy.OVERWRITE) - - results = document_store._embedding_retrieval(query_embedding=[4.0, 4.0, 4.0, 4.0], top_k=3) assert len(results) <= 3 assert results[0].score is not None From a535440ef3894c930e2691cb6e546a3fe11d27ae Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 2 Mar 2026 13:58:20 +0100 Subject: [PATCH 12/16] reuse variable in raise ValueError calls --- .../document_stores/arcadedb/document_store.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py index 2c99528d8b..fb5941a169 100644 --- a/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py +++ b/integrations/arcadedb/src/haystack_integrations/document_stores/arcadedb/document_store.py @@ -268,11 +268,12 @@ def write_documents( :returns: Number of documents written. 
""" self._ensure_initialized() + msg = "documents must be a list of Document objects" if not isinstance(documents, list): - raise ValueError("documents must be a list of Document objects") + raise ValueError(msg) for doc in documents: if not isinstance(doc, Document): - raise ValueError("documents must be a list of Document objects") + raise ValueError(msg) if not documents: return 0 From 36d08ddeb172fed8132bee9e37bb482b7442123f Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 2 Mar 2026 14:01:06 +0100 Subject: [PATCH 13/16] add conftest --- integrations/arcadedb/tests/conftest.py | 28 +++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 integrations/arcadedb/tests/conftest.py diff --git a/integrations/arcadedb/tests/conftest.py b/integrations/arcadedb/tests/conftest.py new file mode 100644 index 0000000000..93da0a28a0 --- /dev/null +++ b/integrations/arcadedb/tests/conftest.py @@ -0,0 +1,28 @@ +# SPDX-FileCopyrightText: 2025-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +"""Pytest fixtures for ArcadeDB integration tests.""" + +import os + +import pytest + +from haystack_integrations.document_stores.arcadedb import ArcadeDBDocumentStore + +ARCADEDB_URL = os.getenv("ARCADEDB_URL", "http://localhost:2480") + + +@pytest.fixture +def document_store(): + """ + ArcadeDB document store instance for integration tests. 
+ + """ + store = ArcadeDBDocumentStore( + url=ARCADEDB_URL, + database="haystack_test", + embedding_dimension=768, + recreate_type=True, + ) + return store From 4d8b624361d7a54e068d10d275b51293fffa77e6 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 2 Mar 2026 14:08:26 +0100 Subject: [PATCH 14/16] use action secret for ArcadeDB --- .github/workflows/arcadedb.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/arcadedb.yml b/.github/workflows/arcadedb.yml index 28b57526f9..81d37c33d6 100644 --- a/.github/workflows/arcadedb.yml +++ b/.github/workflows/arcadedb.yml @@ -19,7 +19,7 @@ env: PYTHONUNBUFFERED: "1" FORCE_COLOR: "1" ARCADEDB_USERNAME: "root" - ARCADEDB_PASSWORD: "arcadedb" + ARCADEDB_PASSWORD: ${{ secrets.ARCADEDB_PASSWORD }} defaults: run: @@ -38,7 +38,7 @@ jobs: arcadedb: image: arcadedata/arcadedb:latest env: - JAVA_OPTS: "-Darcadedb.server.rootPassword=arcadedb" + JAVA_OPTS: "-Darcadedb.server.rootPassword=${{ secrets.ARCADEDB_PASSWORD }}" ports: - 2480:2480 From b379a1444c252a8be0f4e564262ff9d2703a09c4 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 2 Mar 2026 14:19:18 +0100 Subject: [PATCH 15/16] wait for ArcadeDB service to start --- .github/workflows/arcadedb.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/.github/workflows/arcadedb.yml b/.github/workflows/arcadedb.yml index 81d37c33d6..288c97cd0f 100644 --- a/.github/workflows/arcadedb.yml +++ b/.github/workflows/arcadedb.yml @@ -53,6 +53,20 @@ jobs: - name: Install Hatch run: pip install hatch "virtualenv<21.0.0" + - name: Wait for ArcadeDB + run: | + timeout=60 + until [ $timeout -le 0 ] || curl -sSf -u "$ARCADEDB_USERNAME:$ARCADEDB_PASSWORD" http://localhost:2480/api/v1/server > /dev/null; do + echo "Waiting for ArcadeDB service to start... ($timeout s left)" + sleep 5 + timeout=$((timeout - 5)) + done + if [ $timeout -le 0 ]; then + echo "Timed out waiting for ArcadeDB service to start." 
+ exit 1 + fi + echo "ArcadeDB is ready." + - name: Lint if: matrix.python-version == '3.10' && runner.os == 'Linux' run: hatch run fmt-check && hatch run test:types From dea7a7e287e5d0ebd3d5a6737fac8ad832549dfe Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Mon, 2 Mar 2026 14:27:32 +0100 Subject: [PATCH 16/16] use default ARCADEDB_PASSWORD in forks --- .github/workflows/arcadedb.yml | 18 +++--------------- .../arcadedb/tests/test_document_store.py | 14 ++++---------- 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/.github/workflows/arcadedb.yml b/.github/workflows/arcadedb.yml index 288c97cd0f..58c3cb0f31 100644 --- a/.github/workflows/arcadedb.yml +++ b/.github/workflows/arcadedb.yml @@ -19,6 +19,7 @@ env: PYTHONUNBUFFERED: "1" FORCE_COLOR: "1" ARCADEDB_USERNAME: "root" + # Only set in main repo (secrets not passed to fork workflows); integration tests skip when unset ARCADEDB_PASSWORD: ${{ secrets.ARCADEDB_PASSWORD }} defaults: @@ -38,7 +39,8 @@ jobs: arcadedb: image: arcadedata/arcadedb:latest env: - JAVA_OPTS: "-Darcadedb.server.rootPassword=${{ secrets.ARCADEDB_PASSWORD }}" + # Default password so container starts in forks; main repo uses secret + JAVA_OPTS: "-Darcadedb.server.rootPassword=${{ secrets.ARCADEDB_PASSWORD || 'arcadedb' }}" ports: - 2480:2480 @@ -53,20 +55,6 @@ jobs: - name: Install Hatch run: pip install hatch "virtualenv<21.0.0" - - name: Wait for ArcadeDB - run: | - timeout=60 - until [ $timeout -le 0 ] || curl -sSf -u "$ARCADEDB_USERNAME:$ARCADEDB_PASSWORD" http://localhost:2480/api/v1/server > /dev/null; do - echo "Waiting for ArcadeDB service to start... ($timeout s left)" - sleep 5 - timeout=$((timeout - 5)) - done - if [ $timeout -le 0 ]; then - echo "Timed out waiting for ArcadeDB service to start." - exit 1 - fi - echo "ArcadeDB is ready." 
- - name: Lint if: matrix.python-version == '3.10' && runner.os == 'Linux' run: hatch run fmt-check && hatch run test:types diff --git a/integrations/arcadedb/tests/test_document_store.py b/integrations/arcadedb/tests/test_document_store.py index 11c3ed3820..fb0f0ca4a8 100644 --- a/integrations/arcadedb/tests/test_document_store.py +++ b/integrations/arcadedb/tests/test_document_store.py @@ -29,11 +29,6 @@ def _sample_docs(n: int = 3, dim: int = 4) -> list[Document]: return docs -# --------------------------------------------------------------------------- -# Unit tests (no ArcadeDB required) -# --------------------------------------------------------------------------- - - class TestSerialization: def test_to_dict_from_dict(self): store = ArcadeDBDocumentStore( @@ -48,11 +43,10 @@ def test_to_dict_from_dict(self): assert restored._url == store._url -# --------------------------------------------------------------------------- -# Integration tests (require a running ArcadeDB instance) -# --------------------------------------------------------------------------- - - +@pytest.mark.skipif( + not os.environ.get("ARCADEDB_PASSWORD"), + reason="Set ARCADEDB_PASSWORD (e.g. via repo secret in CI) to run integration tests.", +) @pytest.mark.integration class TestArcadeDBDocumentStore(DocumentStoreBaseTests): """