# Patch summary: adds unit tests (no network access required) to the Pinecone integration.
#
# integrations/pinecone/tests/test_document_store.py
#   - imports ByteStream, SparseEmbedding and DuplicatePolicy for the new tests
#   - parametrized type-inference checks for PineconeDocumentStore._get_metadata_fields_info_impl,
#     including a case that expects a "mixed types" warning in the captured log
#   - _get_metadata_field_min_max_impl: accepts a "meta."-prefixed field name and raises
#     ValueError ("No values found") for a field that no document carries
#   - _get_metadata_field_unique_values_impl: list-valued metadata, from_/size pagination and
#     a search term given in upper case ("PY") that is expected to match "python"
#   - _prepare_documents_for_writing: rejects non-Document input and checks the warnings logged
#     for a non-OVERWRITE policy, a missing embedding, a blob and a sparse embedding
#   - sync and async validation errors for an empty query_embedding and a non-dict meta
#
# integrations/pinecone/tests/test_embedding_retriever.py
#   - PineconeEmbeddingRetriever must raise ValueError for a non-PineconeDocumentStore argument
#
# integrations/pinecone/tests/test_filters.py
#   - direct tests of _normalize_filters (comparison operators, unsupported value types,
#     in / not in errors, logical-condition errors, missing keys, "meta." prefix stripping,
#     nested logical conditions) and of _validate_filters
#
# NOTE(review): line breaks and indentation below were restored from a flattened copy of this
# patch; the added-line counts match the hunk headers, but exact leading whitespace should be
# confirmed against the original commit before applying.
diff --git a/integrations/pinecone/tests/test_document_store.py b/integrations/pinecone/tests/test_document_store.py
index 03bd4653a1..cd0940303d 100644
--- a/integrations/pinecone/tests/test_document_store.py
+++ b/integrations/pinecone/tests/test_document_store.py
@@ -11,6 +11,8 @@
 from haystack import Document
 from haystack.components.preprocessors import DocumentSplitter
 from haystack.components.retrievers import SentenceWindowRetriever
+from haystack.dataclasses import ByteStream, SparseEmbedding
+from haystack.document_stores.types import DuplicatePolicy
 from haystack.testing.document_store import (
     CountDocumentsByFilterTest,
     CountDocumentsTest,
@@ -230,6 +232,132 @@ def test_convert_meta_to_int():
     assert PineconeDocumentStore._convert_meta_to_int(meta_data) == {}
 
 
+@pytest.mark.parametrize(
+    ("documents", "expected", "warning_fragment"),
+    [
+        ([], {}, None),
+        (
+            [Document(content="hello", meta={"flag": True})],
+            {"content": {"type": "text"}, "flag": {"type": "boolean"}},
+            None,
+        ),
+        (
+            [Document(content=None, meta={"tags": ["a", "b"]})],
+            {"tags": {"type": "keyword"}},
+            None,
+        ),
+        (
+            [Document(content=None, meta={"counts": [1, 2]})],
+            {"counts": {"type": "long"}},
+            None,
+        ),
+        (
+            [Document(content=None, meta={"empty": []})],
+            {"empty": {"type": "keyword"}},
+            None,
+        ),
+        (
+            [Document(content=None, meta={"pi": 3.14})],
+            {"pi": {"type": "long"}},
+            None,
+        ),
+        (
+            [
+                Document(content=None, meta={"value": 1}),
+                Document(content=None, meta={"value": "two"}),
+            ],
+            {"value": {"type": "keyword"}},
+            "mixed types",
+        ),
+    ],
+)
+def test_get_metadata_fields_info_impl_type_inference(documents, expected, warning_fragment, caplog):
+    with caplog.at_level("WARNING"):
+        result = PineconeDocumentStore._get_metadata_fields_info_impl(documents)
+    assert result == expected
+    if warning_fragment:
+        assert warning_fragment in caplog.text
+
+
+def test_get_metadata_field_min_max_impl_strips_meta_prefix_and_errors():
+    docs = [
+        Document(content="a", meta={"priority": 1}),
+        Document(content="b", meta={"priority": 5}),
+    ]
+    assert PineconeDocumentStore._get_metadata_field_min_max_impl(docs, "meta.priority") == {"min": 1, "max": 5}
+
+    with pytest.raises(ValueError, match="No values found"):
+        PineconeDocumentStore._get_metadata_field_min_max_impl(docs, "missing")
+
+
+def test_get_metadata_field_unique_values_impl_pagination_search_and_lists():
+    docs = [
+        Document(content="a", meta={"tags": ["python", "java"]}),
+        Document(content="b", meta={"tags": ["rust", "go"]}),
+        Document(content="c", meta={"tags": ["python"]}),
+    ]
+
+    values, total = PineconeDocumentStore._get_metadata_field_unique_values_impl(
+        docs, "tags", search_term=None, from_=0, size=10
+    )
+    assert total == 4
+    assert values == ["go", "java", "python", "rust"]
+
+    values, total = PineconeDocumentStore._get_metadata_field_unique_values_impl(
+        docs, "tags", search_term=None, from_=1, size=2
+    )
+    assert total == 4
+    assert values == ["java", "python"]
+
+    values, total = PineconeDocumentStore._get_metadata_field_unique_values_impl(
+        docs, "tags", search_term="PY", from_=0, size=10
+    )
+    assert total == 1
+    assert values == ["python"]
+
+
+def test_prepare_documents_for_writing_edge_cases(caplog):
+    ds = PineconeDocumentStore(api_key=Secret.from_token("fake-api-key"))
+
+    with pytest.raises(ValueError, match="must contain a list of objects of type Document"):
+        ds._prepare_documents_for_writing(["not-a-document"], policy=DuplicatePolicy.NONE)
+
+    docs = [
+        Document(content="no-embedding"),
+        Document(content="with-blob", embedding=[0.1] * 768, blob=ByteStream(data=b"data")),
+        Document(
+            content="with-sparse",
+            embedding=[0.1] * 768,
+            sparse_embedding=SparseEmbedding(indices=[0], values=[1.0]),
+        ),
+    ]
+    with caplog.at_level("WARNING"):
+        result = ds._prepare_documents_for_writing(docs, policy=DuplicatePolicy.SKIP)
+
+    assert len(result) == 3
+    assert result[0][1] == ds._dummy_vector
+    assert "only supports `DuplicatePolicy.OVERWRITE`" in caplog.text
+    assert "has no embedding" in caplog.text
+    assert "blob" in caplog.text
+    assert "sparse_embedding" in caplog.text
+
+
+@pytest.mark.asyncio
+async def test_validation_errors_on_empty_query_and_non_dict_meta():
+    ds = PineconeDocumentStore(api_key=Secret.from_token("fake-api-key"))
+    filters = {"field": "meta.category", "operator": "==", "value": "A"}
+
+    with pytest.raises(ValueError, match="query_embedding must be a non-empty list"):
+        ds._embedding_retrieval(query_embedding=[])
+    with pytest.raises(ValueError, match="query_embedding must be a non-empty list"):
+        await ds._embedding_retrieval_async(query_embedding=[])
+
+    with pytest.raises(ValueError, match="meta must be a dictionary"):
+        ds.update_by_filter(filters=filters, meta="not-a-dict")
+    with pytest.raises(ValueError, match="meta must be a dictionary"):
+        await ds.update_by_filter_async(filters=filters, meta="not-a-dict")
+
+
 @pytest.mark.integration
 @pytest.mark.skipif(not os.environ.get("PINECONE_API_KEY"), reason="PINECONE_API_KEY not set")
 def test_serverless_index_creation_from_scratch(delete_sleep_time):
diff --git a/integrations/pinecone/tests/test_embedding_retriever.py b/integrations/pinecone/tests/test_embedding_retriever.py
index 94ed0d8074..f7e37d49ac 100644
--- a/integrations/pinecone/tests/test_embedding_retriever.py
+++ b/integrations/pinecone/tests/test_embedding_retriever.py
@@ -28,6 +28,11 @@ def test_init_default():
         PineconeEmbeddingRetriever(document_store=mock_store, filter_policy="invalid")
 
 
+def test_init_raises_for_non_pinecone_document_store():
+    with pytest.raises(ValueError, match="document_store must be an instance of PineconeDocumentStore"):
+        PineconeEmbeddingRetriever(document_store="not-a-document-store")
+
+
 @patch("haystack_integrations.document_stores.pinecone.document_store.Pinecone")
 def test_to_dict(mock_pinecone, monkeypatch):
     monkeypatch.setenv("PINECONE_API_KEY", "env-api-key")
diff --git a/integrations/pinecone/tests/test_filters.py b/integrations/pinecone/tests/test_filters.py
index d22488469b..1b61777ae4 100644
--- a/integrations/pinecone/tests/test_filters.py
+++ b/integrations/pinecone/tests/test_filters.py
@@ -2,10 +2,124 @@
 
 import pytest
 from haystack.dataclasses.document import Document
+from haystack.errors import FilterError
 from haystack.testing.document_store import (
     FilterDocumentsTest,
 )
 
+from haystack_integrations.document_stores.pinecone.filters import (
+    _normalize_filters,
+    _validate_filters,
+)
+
+
+def test_normalize_filters_rejects_non_dict():
+    with pytest.raises(FilterError, match="Filters must be a dictionary"):
+        _normalize_filters("not-a-dict")
+
+
+@pytest.mark.parametrize(
+    ("operator", "value", "expected"),
+    [
+        ("==", "foo", {"field": {"$eq": "foo"}}),
+        ("!=", 5, {"field": {"$ne": 5}}),
+        (">", 1.5, {"field": {"$gt": 1.5}}),
+        (">=", 2, {"field": {"$gte": 2}}),
+        ("<", 3, {"field": {"$lt": 3}}),
+        ("<=", 4.2, {"field": {"$lte": 4.2}}),
+        ("in", ["a", "b"], {"field": {"$in": ["a", "b"]}}),
+        ("not in", [1, 2], {"field": {"$nin": [1, 2]}}),
+    ],
+)
+def test_comparison_operators(operator, value, expected):
+    condition = {"field": "field", "operator": operator, "value": value}
+    assert _normalize_filters(condition) == expected
+
+
+@pytest.mark.parametrize(
+    ("operator", "value"),
+    [
+        (">", "not-a-number"),
+        (">=", "not-a-number"),
+        ("<", "not-a-number"),
+        ("<=", "not-a-number"),
+        ("==", [1, 2]),
+        ("!=", [1, 2]),
+    ],
+)
+def test_comparison_rejects_unsupported_value_types(operator, value):
+    condition = {"field": "field", "operator": operator, "value": value}
+    with pytest.raises(FilterError, match="Unsupported type"):
+        _normalize_filters(condition)
+
+
+@pytest.mark.parametrize(
+    ("operator", "value", "match"),
+    [
+        ("in", "not-a-list", "must be a list"),
+        ("not in", "not-a-list", "must be a list"),
+        ("in", [{"nested": "dict"}], "Unsupported type"),
+        ("not in", [{"nested": "dict"}], "Unsupported type"),
+    ],
+)
+def test_in_and_not_in_errors(operator, value, match):
+    with pytest.raises(FilterError, match=match):
+        _normalize_filters({"field": "field", "operator": operator, "value": value})
+
+
+@pytest.mark.parametrize(
+    ("condition", "match"),
+    [
+        ({"conditions": []}, "'operator' key missing"),
+        ({"operator": "AND"}, "'conditions' key missing"),
+        (
+            {"operator": "XOR", "conditions": [{"field": "a", "operator": "==", "value": 1}]},
+            "Unknown logical operator",
+        ),
+    ],
+)
+def test_logical_condition_errors(condition, match):
+    with pytest.raises(FilterError, match=match):
+        _normalize_filters(condition)
+
+
+@pytest.mark.parametrize(
+    ("condition", "match"),
+    [
+        ({"field": "a", "value": 1}, "'operator' key missing"),
+        ({"field": "a", "operator": "=="}, "'value' key missing"),
+    ],
+)
+def test_comparison_condition_errors(condition, match):
+    with pytest.raises(FilterError, match=match):
+        _normalize_filters(condition)
+
+
+def test_meta_prefix_is_stripped():
+    condition = {"field": "meta.category", "operator": "==", "value": "A"}
+    assert _normalize_filters(condition) == {"category": {"$eq": "A"}}
+
+
+def test_nested_logical_conditions_are_parsed():
+    filters = {
+        "operator": "AND",
+        "conditions": [
+            {
+                "operator": "OR",
+                "conditions": [
+                    {"field": "a", "operator": "==", "value": 1},
+                    {"field": "b", "operator": ">", "value": 2},
+                ],
+            },
+        ],
+    }
+    assert _normalize_filters(filters) == {"$and": [{"$or": [{"a": {"$eq": 1}}, {"b": {"$gt": 2}}]}]}
+
+
+def test_validate_filters_rejects_invalid_syntax():
+    with pytest.raises(ValueError, match="Invalid filter syntax"):
+        _validate_filters({"foo": "bar"})
+
 
 @pytest.mark.integration
 @pytest.mark.skipif(not os.environ.get("PINECONE_API_KEY"), reason="PINECONE_API_KEY not set")