Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
128 changes: 128 additions & 0 deletions integrations/pinecone/tests/test_document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
from haystack import Document
from haystack.components.preprocessors import DocumentSplitter
from haystack.components.retrievers import SentenceWindowRetriever
from haystack.dataclasses import ByteStream, SparseEmbedding
from haystack.document_stores.types import DuplicatePolicy
from haystack.testing.document_store import (
CountDocumentsByFilterTest,
CountDocumentsTest,
Expand Down Expand Up @@ -230,6 +232,132 @@ def test_convert_meta_to_int():
assert PineconeDocumentStore._convert_meta_to_int(meta_data) == {}


@pytest.mark.parametrize(
    ("documents", "expected", "warning_fragment"),
    [
        # No documents at all: an empty mapping is expected.
        ([], {}, None),
        # Text content together with a boolean meta value.
        (
            [Document(content="hello", meta={"flag": True})],
            {"content": {"type": "text"}, "flag": {"type": "boolean"}},
            None,
        ),
        # A list of strings is expected to be reported as "keyword".
        (
            [Document(content=None, meta={"tags": ["a", "b"]})],
            {"tags": {"type": "keyword"}},
            None,
        ),
        # A list of integers is expected to be reported as "long".
        (
            [Document(content=None, meta={"counts": [1, 2]})],
            {"counts": {"type": "long"}},
            None,
        ),
        # An empty list is expected to be reported as "keyword".
        (
            [Document(content=None, meta={"empty": []})],
            {"empty": {"type": "keyword"}},
            None,
        ),
        # A float value is expected to be reported as "long".
        (
            [Document(content=None, meta={"pi": 3.14})],
            {"pi": {"type": "long"}},
            None,
        ),
        # Same field with an int in one document and a str in another:
        # "keyword" is expected and a warning mentioning "mixed types".
        (
            [
                Document(content=None, meta={"value": 1}),
                Document(content=None, meta={"value": "two"}),
            ],
            {"value": {"type": "keyword"}},
            "mixed types",
        ),
    ],
)
def test_get_metadata_fields_info_impl_type_inference(documents, expected, warning_fragment, caplog):
    """Check the field-type mapping inferred from each parametrized document set.

    When ``warning_fragment`` is given, it must also appear in the log text
    captured at WARNING level; otherwise the log output is not inspected.
    """
    with caplog.at_level("WARNING"):
        inferred = PineconeDocumentStore._get_metadata_fields_info_impl(documents)

    assert inferred == expected

    # Cases without an expected warning stop here.
    if not warning_fragment:
        return
    assert warning_fragment in caplog.text


def test_get_metadata_field_min_max_impl_strips_meta_prefix_and_errors():
    """Min/max lookup accepts a ``meta.``-prefixed field and raises for an absent one."""
    documents = [
        Document(content=text, meta={"priority": priority})
        for text, priority in (("a", 1), ("b", 5))
    ]

    # The field is addressed as "meta.priority" while the meta key is "priority".
    bounds = PineconeDocumentStore._get_metadata_field_min_max_impl(documents, "meta.priority")
    assert bounds == {"min": 1, "max": 5}

    # A field that no document carries must raise.
    with pytest.raises(ValueError, match="No values found"):
        PineconeDocumentStore._get_metadata_field_min_max_impl(documents, "missing")


def test_get_metadata_field_unique_values_impl_pagination_search_and_lists():
    """Unique values of a list-valued field: full listing, a paginated slice, and a search term."""
    tag_sets = (("a", ["python", "java"]), ("b", ["rust", "go"]), ("c", ["python"]))
    documents = [Document(content=text, meta={"tags": tags}) for text, tags in tag_sets]

    # (search_term, from_, size, expected values, expected total)
    scenarios = (
        # Everything: four distinct tags, returned in sorted order.
        (None, 0, 10, ["go", "java", "python", "rust"], 4),
        # Offset 1, page size 2: the total still counts all four values.
        (None, 1, 2, ["java", "python"], 4),
        # Upper-case search term "PY" is expected to match "python".
        ("PY", 0, 10, ["python"], 1),
    )

    for term, offset, page_size, expected_values, expected_total in scenarios:
        values, total = PineconeDocumentStore._get_metadata_field_unique_values_impl(
            documents, "tags", search_term=term, from_=offset, size=page_size
        )
        assert total == expected_total
        assert values == expected_values


def test_prepare_documents_for_writing_edge_cases(caplog):
    """Exercise ``_prepare_documents_for_writing`` with invalid input and unusual documents.

    Covers: a non-Document item (must raise), a non-OVERWRITE policy, a document
    without an embedding, a document with a blob, and a document with a sparse
    embedding. The last four are checked through the captured WARNING log text.
    """
    store = PineconeDocumentStore(api_key=Secret.from_token("fake-api-key"))

    with pytest.raises(ValueError, match="must contain a list of objects of type Document"):
        store._prepare_documents_for_writing(["not-a-document"], policy=DuplicatePolicy.NONE)

    embedding_dim = 768
    without_embedding = Document(content="no-embedding")
    with_blob = Document(
        content="with-blob",
        embedding=[0.1] * embedding_dim,
        blob=ByteStream(data=b"data"),
    )
    with_sparse = Document(
        content="with-sparse",
        embedding=[0.1] * embedding_dim,
        sparse_embedding=SparseEmbedding(indices=[0], values=[1.0]),
    )

    with caplog.at_level("WARNING"):
        prepared = store._prepare_documents_for_writing(
            [without_embedding, with_blob, with_sparse], policy=DuplicatePolicy.SKIP
        )

    # All three documents are kept.
    assert len(prepared) == 3
    # Element 1 of the first prepared entry equals the store's dummy vector.
    assert prepared[0][1] == store._dummy_vector

    expected_log_fragments = (
        "only supports `DuplicatePolicy.OVERWRITE`",
        "has no embedding",
        "blob",
        "sparse_embedding",
    )
    for fragment in expected_log_fragments:
        assert fragment in caplog.text


@pytest.mark.asyncio
async def test_validation_errors_on_empty_query_and_non_dict_meta():
    """Sync and async entry points reject an empty query embedding and a non-dict ``meta``."""
    store = PineconeDocumentStore(api_key=Secret.from_token("fake-api-key"))
    category_filter = {"field": "meta.category", "operator": "==", "value": "A"}

    empty_query_message = "query_embedding must be a non-empty list"
    bad_meta_message = "meta must be a dictionary"

    # Empty query embedding: sync first, then async.
    with pytest.raises(ValueError, match=empty_query_message):
        store._embedding_retrieval(query_embedding=[])
    with pytest.raises(ValueError, match=empty_query_message):
        await store._embedding_retrieval_async(query_embedding=[])

    # A string passed as meta: sync first, then async.
    with pytest.raises(ValueError, match=bad_meta_message):
        store.update_by_filter(filters=category_filter, meta="not-a-dict")
    with pytest.raises(ValueError, match=bad_meta_message):
        await store.update_by_filter_async(filters=category_filter, meta="not-a-dict")


@pytest.mark.integration
@pytest.mark.skipif(not os.environ.get("PINECONE_API_KEY"), reason="PINECONE_API_KEY not set")
def test_serverless_index_creation_from_scratch(delete_sleep_time):
Expand Down
5 changes: 5 additions & 0 deletions integrations/pinecone/tests/test_embedding_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@ def test_init_default():
PineconeEmbeddingRetriever(document_store=mock_store, filter_policy="invalid")


def test_init_raises_for_non_pinecone_document_store():
    """Constructing the retriever with a plain string as ``document_store`` must raise."""
    expected_message = "document_store must be an instance of PineconeDocumentStore"
    with pytest.raises(ValueError, match=expected_message):
        PineconeEmbeddingRetriever(document_store="not-a-document-store")


@patch("haystack_integrations.document_stores.pinecone.document_store.Pinecone")
def test_to_dict(mock_pinecone, monkeypatch):
monkeypatch.setenv("PINECONE_API_KEY", "env-api-key")
Expand Down
114 changes: 114 additions & 0 deletions integrations/pinecone/tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,124 @@

import pytest
from haystack.dataclasses.document import Document
from haystack.errors import FilterError
from haystack.testing.document_store import (
FilterDocumentsTest,
)

from haystack_integrations.document_stores.pinecone.filters import (
_normalize_filters,
_validate_filters,
)


def test_normalize_filters_rejects_non_dict():
    """A string passed as filters must raise ``FilterError``."""
    expected_message = "Filters must be a dictionary"
    with pytest.raises(FilterError, match=expected_message):
        _normalize_filters("not-a-dict")


@pytest.mark.parametrize(
    ("operator", "value", "expected"),
    [
        # Equality / inequality.
        ("==", "foo", {"field": {"$eq": "foo"}}),
        ("!=", 5, {"field": {"$ne": 5}}),
        # Ordering comparisons with numeric values.
        (">", 1.5, {"field": {"$gt": 1.5}}),
        (">=", 2, {"field": {"$gte": 2}}),
        ("<", 3, {"field": {"$lt": 3}}),
        ("<=", 4.2, {"field": {"$lte": 4.2}}),
        # Membership with list values.
        ("in", ["a", "b"], {"field": {"$in": ["a", "b"]}}),
        ("not in", [1, 2], {"field": {"$nin": [1, 2]}}),
    ],
)
def test_comparison_operators(operator, value, expected):
    """Each comparison operator is normalized to its expected ``$``-prefixed form."""
    normalized = _normalize_filters({"field": "field", "operator": operator, "value": value})
    assert normalized == expected


@pytest.mark.parametrize(
    ("operator", "value"),
    [
        # Ordering comparisons given a string value.
        (">", "not-a-number"),
        (">=", "not-a-number"),
        ("<", "not-a-number"),
        ("<=", "not-a-number"),
        # Equality / inequality given a list value.
        ("==", [1, 2]),
        ("!=", [1, 2]),
    ],
)
def test_comparison_rejects_unsupported_value_types(operator, value):
    """Operator/value pairings listed above must raise ``FilterError``."""
    rejected = dict(field="field", operator=operator, value=value)
    with pytest.raises(FilterError, match="Unsupported type"):
        _normalize_filters(rejected)


@pytest.mark.parametrize(
    ("operator", "value", "match"),
    [
        # A non-list value for a membership operator.
        ("in", "not-a-list", "must be a list"),
        ("not in", "not-a-list", "must be a list"),
        # A list whose element is a dict.
        ("in", [{"nested": "dict"}], "Unsupported type"),
        ("not in", [{"nested": "dict"}], "Unsupported type"),
    ],
)
def test_in_and_not_in_errors(operator, value, match):
    """Invalid values for ``in`` / ``not in`` raise ``FilterError`` with the expected message."""
    membership_condition = {"field": "field", "operator": operator, "value": value}
    with pytest.raises(FilterError, match=match):
        _normalize_filters(membership_condition)


@pytest.mark.parametrize(
    ("condition", "match"),
    [
        # "conditions" present without "operator".
        ({"conditions": []}, "'operator' key missing"),
        # "operator" present without "conditions".
        ({"operator": "AND"}, "'conditions' key missing"),
        # An operator name that is not recognized.
        (
            {"operator": "XOR", "conditions": [{"field": "a", "operator": "==", "value": 1}]},
            "Unknown logical operator",
        ),
    ],
)
def test_logical_condition_errors(condition, match):
    """Malformed logical conditions raise ``FilterError`` with the expected message."""
    malformed = condition
    with pytest.raises(FilterError, match=match):
        _normalize_filters(malformed)


@pytest.mark.parametrize(
    ("condition", "match"),
    [
        # Comparison without an "operator" key.
        ({"field": "a", "value": 1}, "'operator' key missing"),
        # Comparison without a "value" key.
        ({"field": "a", "operator": "=="}, "'value' key missing"),
    ],
)
def test_comparison_condition_errors(condition, match):
    """Comparison conditions missing a required key raise ``FilterError``."""
    incomplete = condition
    with pytest.raises(FilterError, match=match):
        _normalize_filters(incomplete)


def test_meta_prefix_is_stripped():
    """A ``meta.``-prefixed field name is normalized to the bare field name."""
    normalized = _normalize_filters(dict(field="meta.category", operator="==", value="A"))
    assert normalized == {"category": {"$eq": "A"}}


def test_nested_logical_conditions_are_parsed():
    """An OR group nested inside an AND group is normalized recursively."""
    a_equals_one = {"field": "a", "operator": "==", "value": 1}
    b_greater_than_two = {"field": "b", "operator": ">", "value": 2}
    either = {"operator": "OR", "conditions": [a_equals_one, b_greater_than_two]}
    nested = {"operator": "AND", "conditions": [either]}

    expected = {"$and": [{"$or": [{"a": {"$eq": 1}}, {"b": {"$gt": 2}}]}]}
    assert _normalize_filters(nested) == expected


def test_validate_filters_rejects_invalid_syntax():
    """A dict that has neither the comparison nor the logical shape must raise ``ValueError``."""
    malformed = {"foo": "bar"}
    with pytest.raises(ValueError, match="Invalid filter syntax"):
        _validate_filters(malformed)


@pytest.mark.integration
@pytest.mark.skipif(not os.environ.get("PINECONE_API_KEY"), reason="PINECONE_API_KEY not set")
Expand Down
Loading