Skip to content

Commit 3eb9a38

Browse files
committed
test(amazon-s3-vectors): adopt Haystack base test mixins for integration tests
Replace the handcrafted integration suite with Haystack's standard DocumentStore test contract, following the pgvector / pinecone pattern:

* tests/conftest.py: shared `document_store` fixture. One vector bucket per session, one index per test for isolation. Wraps `write_documents` and `delete_documents` to (a) inject a default zero embedding for any Document missing one, since S3 Vectors requires embeddings, and (b) sleep briefly afterwards to absorb eventual consistency.
* test_document_store.py: appends `TestDocumentStore` integration class inheriting CountDocumentsTest, WriteDocumentsTest, DeleteDocumentsTest, FilterableDocsFixtureMixin. Overrides `assert_documents_are_equal` for float32 round-trip tolerance.
* test_filters.py: appends `TestFilters` inheriting FilterDocumentsTest. `filter_documents` already routes matching through Haystack's `document_matches_filter`, so no operators are skipped.
* test_integration.py: trimmed to the retriever-specific tests not covered by the base mixins (embedding retrieval, retriever component, to_dict/from_dict roundtrip on a live store).

Coverage goes from 12 handcrafted integration tests to 53 (10 store + 39 filter + 4 retriever).

Addresses #3149 (comment)
1 parent c7b42e2 commit 3eb9a38

4 files changed

Lines changed: 288 additions & 192 deletions

File tree

Lines changed: 164 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,164 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
"""
6+
Shared pytest fixtures for the Amazon S3 Vectors integration tests.
7+
8+
The `document_store` fixture used by the Haystack base test mixins
9+
(`CountDocumentsTest`, `WriteDocumentsTest`, `DeleteDocumentsTest`,
10+
`FilterDocumentsTest`) is defined here.
11+
12+
A single vector bucket is created per test session (creating buckets is the
13+
slow part). Each test gets its own vector index inside that shared bucket so
14+
state is isolated.
15+
16+
Two quirks of S3 Vectors are smoothed over here so the generic base tests
17+
can run unchanged:
18+
19+
1. **Embeddings are required.** The base tests write `Document(content="...")`
20+
without an embedding. We wrap `write_documents` so any document missing an
21+
embedding gets a deterministic zero vector of the right dimension.
22+
2. **Writes are eventually consistent.** We sleep briefly after `write_documents`
23+
and `delete_documents` so the subsequent `filter_documents`/`count_documents`
24+
reflects the new state.
25+
"""
26+
27+
from __future__ import annotations
28+
29+
import os
30+
import time
31+
import uuid
32+
import warnings
33+
from collections.abc import Iterator
34+
35+
import boto3
36+
import pytest
37+
from botocore.exceptions import ClientError
38+
from haystack.dataclasses import Document
39+
from haystack.document_stores.types import DuplicatePolicy
40+
41+
from haystack_integrations.document_stores.amazon_s3_vectors import S3VectorsDocumentStore
42+
43+
# Embedding dimension of the per-test index; this is the dimension used by
# Haystack's `FilterableDocsFixtureMixin` test corpus.
DIMENSION = 768

# AWS region for the S3 Vectors client. `or` (rather than a `.get` default) also
# covers a variable that is set but empty, which would otherwise yield an empty
# region name.
REGION = os.environ.get("AWS_DEFAULT_REGION") or "us-east-1"

# Eventual-consistency budget for S3 Vectors write/delete propagation: seconds
# to wait after each write or delete before reading back.
WRITE_SLEEP_SECONDS = 5
DELETE_SLEEP_SECONDS = 5
50+
51+
52+
def _aws_credentials_available() -> bool:
    """
    Report whether any boto3 credential source appears to be configured.

    The common credential environment variables are checked first. If none of
    them is set, boto3's own credential resolution decides; any error raised
    while resolving is treated as "no credentials".
    """
    env_hints = ("AWS_ACCESS_KEY_ID", "AWS_PROFILE", "AWS_ROLE_ARN")
    for name in env_hints:
        if os.environ.get(name):
            return True

    try:
        credentials = boto3.Session().get_credentials()
    except Exception:
        # Best-effort probe: a failing lookup simply means "not available".
        return False
    return credentials is not None
60+
61+
62+
def pytest_collection_modifyitems(config: pytest.Config, items: list[pytest.Item]) -> None:  # noqa: ARG001
    """
    Skip every collected integration test when AWS credentials are not configured.

    Credentials are probed only when at least one integration test was
    collected: the probe may fall through to boto3's credential resolution,
    which unit-only runs never need.

    :param config: The pytest config object (unused; part of the hook signature).
    :param items: The collected test items. Items carrying the `integration`
        keyword get a skip marker added in place.
    """
    integration_items = [item for item in items if "integration" in item.keywords]
    if not integration_items or _aws_credentials_available():
        return

    skip = pytest.mark.skip(reason="AWS credentials not configured")
    for item in integration_items:
        item.add_marker(skip)
70+
71+
72+
@pytest.fixture(scope="session")
def s3_vectors_bucket() -> Iterator[str]:
    """
    Create one vector bucket for the test session and tear it down at the end.

    Teardown is best-effort: every index still listed in the bucket is deleted
    first, then the bucket itself. A failing cleanup step never fails the
    session, but it is reported as a warning so leftover AWS resources do not
    go unnoticed.

    :returns: An iterator yielding the name of the created vector bucket.
    """
    if not _aws_credentials_available():
        pytest.skip("AWS credentials not configured")

    bucket_name = f"haystack-test-{uuid.uuid4().hex[:8]}"
    client = boto3.client("s3vectors", region_name=REGION)
    client.create_vector_bucket(vectorBucketName=bucket_name)

    yield bucket_name

    # Tear down: delete every index in the bucket, then the bucket itself.
    try:
        next_token: str | None = None
        while True:
            kwargs: dict = {"vectorBucketName": bucket_name}
            if next_token:
                kwargs["nextToken"] = next_token
            response = client.list_indexes(**kwargs)
            for idx in response.get("indexes", []):
                index_name = idx["indexName"]
                try:
                    client.delete_index(vectorBucketName=bucket_name, indexName=index_name)
                except ClientError as exc:
                    # Keep going: one stuck index should not block the others.
                    warnings.warn(
                        f"Could not delete index '{index_name}' in vector bucket '{bucket_name}': {exc}",
                        stacklevel=2,
                    )
            next_token = response.get("nextToken")
            if not next_token:
                break
    except ClientError as exc:
        warnings.warn(
            f"Could not list indexes in vector bucket '{bucket_name}': {exc}",
            stacklevel=2,
        )

    try:
        client.delete_vector_bucket(vectorBucketName=bucket_name)
    except ClientError as exc:
        warnings.warn(
            f"Could not delete vector bucket '{bucket_name}'; it may need manual cleanup: {exc}",
            stacklevel=2,
        )
107+
108+
109+
@pytest.fixture
def document_store(s3_vectors_bucket: str) -> Iterator[S3VectorsDocumentStore]:
    """
    Provide a fresh S3VectorsDocumentStore (one per test, each with its own index).

    `write_documents` and `delete_documents` are wrapped on the instance to:

    * inject a default embedding for any `Document` that doesn't have one, and
    * sleep briefly afterwards to absorb S3 Vectors' eventual consistency.

    The write wrapper forwards `policy` only when the caller passed one, so
    tests that rely on the default duplicate policy exercise the store's own
    default rather than a default chosen by this fixture.

    :param s3_vectors_bucket: Name of the session-wide vector bucket.
    :returns: An iterator yielding the wrapped document store.
    """
    index_name = f"idx-{uuid.uuid4().hex[:10]}"

    store = S3VectorsDocumentStore(
        vector_bucket_name=s3_vectors_bucket,
        index_name=index_name,
        dimension=DIMENSION,
        distance_metric="cosine",
        region_name=REGION,
        create_bucket_and_index=True,
        non_filterable_metadata_keys=[],
    )

    # Eagerly create the index so the first write doesn't race with index creation.
    store._get_client()

    original_write = store.write_documents

    def write_with_defaults(documents: list[Document], policy: DuplicatePolicy | None = None) -> int:
        # Only mutate well-formed input; bad input must surface as ValueError from
        # the production code, not crash this wrapper.
        if isinstance(documents, list):
            # Presumably silences the warning Document emits on in-place
            # attribute assignment — TODO confirm.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                for d in documents:
                    if isinstance(d, Document) and d.embedding is None:
                        # NOTE(review): the index uses cosine distance; confirm that
                        # S3 Vectors accepts all-zero vectors for that metric.
                        d.embedding = [0.0] * DIMENSION
        if policy is None:
            # Let the store apply its own default duplicate policy.
            result = original_write(documents)
        else:
            result = original_write(documents, policy)
        time.sleep(WRITE_SLEEP_SECONDS)
        return result

    store.write_documents = write_with_defaults  # type: ignore[method-assign]

    original_delete = store.delete_documents

    def delete_with_sleep(document_ids: list[str]) -> None:
        original_delete(document_ids)
        time.sleep(DELETE_SLEEP_SECONDS)

    store.delete_documents = delete_with_sleep  # type: ignore[method-assign]

    yield store

    # Clean up: drop the per-test index. Best-effort.
    try:
        store._get_client().delete_index(vectorBucketName=s3_vectors_bucket, indexName=index_name)
    except ClientError:
        pass

integrations/amazon_s3_vectors/tests/test_document_store.py

Lines changed: 49 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,19 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
from dataclasses import replace
56
from unittest.mock import MagicMock, patch
67

78
import pytest
89
from haystack.dataclasses import Document
910
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
1011
from haystack.document_stores.types import DuplicatePolicy
12+
from haystack.testing.document_store import (
13+
CountDocumentsTest,
14+
DeleteDocumentsTest,
15+
FilterableDocsFixtureMixin,
16+
WriteDocumentsTest,
17+
)
1118

1219
from haystack_integrations.document_stores.amazon_s3_vectors import S3VectorsDocumentStore
1320

@@ -268,3 +275,45 @@ def test_document_roundtrip():
268275
assert restored.content == doc.content
269276
assert restored.embedding == doc.embedding
270277
assert restored.meta == doc.meta
278+
279+
280+
# ---------------------------------------------------------------------------
281+
# Integration tests — exercise a real S3 Vectors bucket via the `document_store`
282+
# fixture in conftest.py. The mixins below come from Haystack's test kit so we
283+
# get its standard Document Store contract for free.
284+
# ---------------------------------------------------------------------------
285+
286+
287+
@pytest.mark.integration
class TestDocumentStore(
    CountDocumentsTest,
    WriteDocumentsTest,
    DeleteDocumentsTest,
    FilterableDocsFixtureMixin,
):
    """Run Haystack's count/write/delete test mixins against the `document_store` fixture."""

    def assert_documents_are_equal(self, received: list[Document], expected: list[Document]) -> None:
        """
        Compare documents while tolerating two S3 Vectors quirks:

        * embeddings round-trip through float32 storage, so we use `pytest.approx`;
        * the `score` field is not set by `filter_documents`, only by retrieval, so
          we ignore it for equality.

        Documents are paired by sorted `id`. Sorting works on copies so the
        caller's lists keep their order.

        :param received: Documents returned by the document store.
        :param expected: Documents the test expects.
        """
        assert len(received) == len(expected)
        received_sorted = sorted(received, key=lambda d: d.id)
        expected_sorted = sorted(expected, key=lambda d: d.id)
        for r, e in zip(received_sorted, expected_sorted, strict=True):
            # Compare everything except embedding and score exactly.
            r_norm = replace(r, embedding=None, score=None)
            e_norm = replace(e, embedding=None, score=None)
            assert r_norm == e_norm
            # Embeddings are only compared when both sides carry one.
            if r.embedding is not None and e.embedding is not None:
                assert r.embedding == pytest.approx(e.embedding, abs=1e-5)

    def test_write_documents(self, document_store: S3VectorsDocumentStore) -> None:
        """
        Default behaviour is OVERWRITE (S3 Vectors `put_vectors` is upsert), so
        writing the same document twice succeeds and returns 1 each time.
        """
        docs = [Document(id="1", content="hello", embedding=[0.1] * 768)]
        assert document_store.write_documents(docs) == 1
        assert document_store.write_documents(docs) == 1

integrations/amazon_s3_vectors/tests/test_filters.py

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,8 +9,12 @@
99
meta. prefix stripping, logical nesting) so unit testing is high signal.
1010
"""
1111

12+
from dataclasses import replace
13+
1214
import pytest
15+
from haystack.dataclasses import Document
1316
from haystack.errors import FilterError
17+
from haystack.testing.document_store import FilterDocumentsTest
1418

1519
from haystack_integrations.document_stores.amazon_s3_vectors.filters import _normalize_filters, _validate_filters
1620

@@ -87,3 +91,26 @@ def test_validate_filters():
8791
_validate_filters({"operator": "AND", "conditions": []}) # valid structure
8892
with pytest.raises(ValueError, match="Invalid filter syntax"):
8993
_validate_filters({"field": "meta.x"}) # missing operator/conditions
94+
95+
96+
# ---------------------------------------------------------------------------
97+
# Integration tests — run Haystack's full filter contract against a real S3
98+
# Vectors index. `S3VectorsDocumentStore.filter_documents` delegates the
99+
# actual matching to `haystack.utils.filters.document_matches_filter`, so the
100+
# only S3-specific quirk we have to absorb here is the float32 round-trip on
101+
# embeddings.
102+
# ---------------------------------------------------------------------------
103+
104+
105+
@pytest.mark.integration
class TestFilters(FilterDocumentsTest):
    """Run Haystack's `FilterDocumentsTest` contract against the `document_store` fixture."""

    def assert_documents_are_equal(self, received: list[Document], expected: list[Document]) -> None:
        """
        Compare documents pairwise by sorted `id`, ignoring `score` and comparing
        embeddings with a float tolerance.

        Sorting works on copies so the caller's lists keep their order.

        :param received: Documents returned by the document store.
        :param expected: Documents the test expects.
        """
        assert len(received) == len(expected)
        received_sorted = sorted(received, key=lambda d: d.id)
        expected_sorted = sorted(expected, key=lambda d: d.id)
        for r, e in zip(received_sorted, expected_sorted, strict=True):
            # Compare everything except embedding and score exactly.
            r_norm = replace(r, embedding=None, score=None)
            e_norm = replace(e, embedding=None, score=None)
            assert r_norm == e_norm
            # Embeddings are only compared when both sides carry one.
            if r.embedding is not None and e.embedding is not None:
                assert r.embedding == pytest.approx(e.embedding, abs=1e-5)

0 commit comments

Comments
 (0)