|
11 | 11 | from haystack import Document |
12 | 12 | from haystack.components.preprocessors import DocumentSplitter |
13 | 13 | from haystack.components.retrievers import SentenceWindowRetriever |
| 14 | +from haystack.dataclasses import ByteStream, SparseEmbedding |
| 15 | +from haystack.document_stores.types import DuplicatePolicy |
14 | 16 | from haystack.testing.document_store import ( |
15 | 17 | CountDocumentsByFilterTest, |
16 | 18 | CountDocumentsTest, |
@@ -230,6 +232,132 @@ def test_convert_meta_to_int(): |
230 | 232 | assert PineconeDocumentStore._convert_meta_to_int(meta_data) == {} |
231 | 233 |
|
232 | 234 |
|
@pytest.mark.parametrize(
    ("documents", "expected", "warning_fragment"),
    [
        # No documents at all: nothing to infer.
        pytest.param([], {}, None),
        # Text content plus a boolean meta value.
        pytest.param(
            [Document(content="hello", meta={"flag": True})],
            {"content": {"type": "text"}, "flag": {"type": "boolean"}},
            None,
        ),
        # A list of strings is expected to be reported as "keyword".
        pytest.param(
            [Document(content=None, meta={"tags": ["a", "b"]})],
            {"tags": {"type": "keyword"}},
            None,
        ),
        # A list of ints is expected to be reported as "long".
        pytest.param(
            [Document(content=None, meta={"counts": [1, 2]})],
            {"counts": {"type": "long"}},
            None,
        ),
        # An empty list is expected to be reported as "keyword".
        pytest.param(
            [Document(content=None, meta={"empty": []})],
            {"empty": {"type": "keyword"}},
            None,
        ),
        # A float value is expected to be reported as "long".
        pytest.param(
            [Document(content=None, meta={"pi": 3.14})],
            {"pi": {"type": "long"}},
            None,
        ),
        # The same key holding an int in one document and a str in another:
        # expected to be reported as "keyword", with a "mixed types" warning logged.
        pytest.param(
            [
                Document(content=None, meta={"value": 1}),
                Document(content=None, meta={"value": "two"}),
            ],
            {"value": {"type": "keyword"}},
            "mixed types",
        ),
    ],
)
def test_get_metadata_fields_info_impl_type_inference(documents, expected, warning_fragment, caplog):
    """
    Check the field-type mapping inferred from each set of documents.

    When a case supplies `warning_fragment`, that text must also appear in the captured WARNING-level logs.
    """
    with caplog.at_level("WARNING"):
        inferred = PineconeDocumentStore._get_metadata_fields_info_impl(documents)

    assert inferred == expected

    # Cases without an expected warning make no assertion about the log output.
    if not warning_fragment:
        return
    assert warning_fragment in caplog.text
| 280 | + |
| 281 | + |
def test_get_metadata_field_min_max_impl_strips_meta_prefix_and_errors():
    """
    Min/max lookup accepts a `meta.`-prefixed field name and raises for a field with no values.
    """
    docs = [Document(content=text, meta={"priority": priority}) for text, priority in (("a", 1), ("b", 5))]

    # The meta key is "priority", but the field is addressed as "meta.priority".
    bounds = PineconeDocumentStore._get_metadata_field_min_max_impl(docs, "meta.priority")
    assert bounds == {"min": 1, "max": 5}

    # None of the documents carries a "missing" meta key.
    with pytest.raises(ValueError, match="No values found"):
        PineconeDocumentStore._get_metadata_field_min_max_impl(docs, "missing")
| 291 | + |
| 292 | + |
def test_get_metadata_field_unique_values_impl_pagination_search_and_lists():
    """
    Unique-value lookup over list-valued meta: full listing, a paginated slice, and a search term.
    """
    docs = [
        Document(content=text, meta={"tags": tags})
        for text, tags in (
            ("a", ["python", "java"]),
            ("b", ["rust", "go"]),
            ("c", ["python"]),
        )
    ]

    # (search_term, from_, size, expected values, expected total)
    scenarios = [
        # Everything: the four distinct tags across all lists, in sorted order.
        (None, 0, 10, ["go", "java", "python", "rust"], 4),
        # Window of two starting at offset one; the total still counts all four.
        (None, 1, 2, ["java", "python"], 4),
        # Upper-case search term "PY" is expected to match only "python".
        ("PY", 0, 10, ["python"], 1),
    ]

    for search_term, offset, limit, expected_values, expected_total in scenarios:
        values, total = PineconeDocumentStore._get_metadata_field_unique_values_impl(
            docs, "tags", search_term=search_term, from_=offset, size=limit
        )
        assert total == expected_total
        assert values == expected_values
| 317 | + |
| 318 | + |
def test_prepare_documents_for_writing_edge_cases(caplog):
    """
    `_prepare_documents_for_writing` rejects non-Document input and warns about unsupported inputs.

    Covered inputs: a non-OVERWRITE duplicate policy, a document without an embedding, a document with a blob,
    and a document with a sparse embedding.
    """
    store = PineconeDocumentStore(api_key=Secret.from_token("fake-api-key"))

    with pytest.raises(ValueError, match="must contain a list of objects of type Document"):
        store._prepare_documents_for_writing(["not-a-document"], policy=DuplicatePolicy.NONE)

    without_embedding = Document(content="no-embedding")
    with_blob = Document(content="with-blob", embedding=[0.1] * 768, blob=ByteStream(data=b"data"))
    with_sparse = Document(
        content="with-sparse",
        embedding=[0.1] * 768,
        sparse_embedding=SparseEmbedding(indices=[0], values=[1.0]),
    )

    with caplog.at_level("WARNING"):
        prepared = store._prepare_documents_for_writing(
            [without_embedding, with_blob, with_sparse], policy=DuplicatePolicy.SKIP
        )

    # All three documents are kept; the one without an embedding gets the store's dummy vector.
    assert len(prepared) == 3
    assert prepared[0][1] == store._dummy_vector

    expected_warning_fragments = (
        "only supports `DuplicatePolicy.OVERWRITE`",
        "has no embedding",
        "blob",
        "sparse_embedding",
    )
    for fragment in expected_warning_fragments:
        assert fragment in caplog.text
| 343 | + |
| 344 | + |
@pytest.mark.asyncio
async def test_validation_errors_on_empty_query_and_non_dict_meta():
    """
    Sync and async entry points raise ValueError for an empty query embedding and for non-dict `meta`.
    """
    store = PineconeDocumentStore(api_key=Secret.from_token("fake-api-key"))
    category_filter = {"field": "meta.category", "operator": "==", "value": "A"}

    empty_query_message = "query_embedding must be a non-empty list"
    non_dict_meta_message = "meta must be a dictionary"

    # Empty query embedding: sync variant first, then async.
    with pytest.raises(ValueError, match=empty_query_message):
        store._embedding_retrieval(query_embedding=[])
    with pytest.raises(ValueError, match=empty_query_message):
        await store._embedding_retrieval_async(query_embedding=[])

    # `meta` passed as a string instead of a dict: sync variant first, then async.
    with pytest.raises(ValueError, match=non_dict_meta_message):
        store.update_by_filter(filters=category_filter, meta="not-a-dict")
    with pytest.raises(ValueError, match=non_dict_meta_message):
        await store.update_by_filter_async(filters=category_filter, meta="not-a-dict")
| 359 | + |
| 360 | + |
233 | 361 | @pytest.mark.integration |
234 | 362 | @pytest.mark.skipif(not os.environ.get("PINECONE_API_KEY"), reason="PINECONE_API_KEY not set") |
235 | 363 | def test_serverless_index_creation_from_scratch(delete_sleep_time): |
|
0 commit comments