1010from haystack .document_stores .errors import DocumentStoreError , DuplicateDocumentError
1111from haystack .document_stores .types import DuplicatePolicy
1212from haystack .testing .document_store import (
13+ CountDocumentsByFilterTest ,
1314 CountDocumentsTest ,
15+ CountUniqueMetadataByFilterTest ,
1416 DeleteAllTest ,
1517 DeleteByFilterTest ,
1618 DeleteDocumentsTest ,
1719 FilterableDocsFixtureMixin ,
20+ GetMetadataFieldMinMaxTest ,
21+ GetMetadataFieldsInfoTest ,
22+ GetMetadataFieldUniqueValuesTest ,
1823 UpdateByFilterTest ,
1924 WriteDocumentsTest ,
2025)
@@ -32,7 +37,26 @@ class TestDocumentStore(
3237 FilterableDocsFixtureMixin ,
3338 UpdateByFilterTest ,
3439 WriteDocumentsTest ,
40+ CountDocumentsByFilterTest ,
41+ CountUniqueMetadataByFilterTest ,
42+ GetMetadataFieldsInfoTest ,
43+ GetMetadataFieldMinMaxTest ,
44+ GetMetadataFieldUniqueValuesTest ,
3545):
def test_get_metadata_fields_info_empty_collection(self, document_store: PgvectorDocumentStore):
    """Check the fields info reported by a store that holds no documents.

    The only entry expected is ``content`` with type ``text``.
    """
    # Precondition: nothing has been written to the store yet.
    assert document_store.count_documents() == 0

    expected_info = {"content": {"type": "text"}}
    assert document_store.get_metadata_fields_info() == expected_info
52+
def test_get_metadata_field_min_max_empty_collection(self, document_store: PgvectorDocumentStore):
    """Check that requesting min/max of ``priority`` on an empty store raises ``ValueError``."""
    # Precondition: nothing has been written to the store yet.
    assert document_store.count_documents() == 0

    expected_message = "not found in document store"
    with pytest.raises(ValueError, match=expected_message):
        document_store.get_metadata_field_min_max("priority")
59+
3660 def test_write_documents (self , document_store : PgvectorDocumentStore ):
3761 docs = [Document (id = "1" )]
3862 assert document_store .write_documents (docs ) == 1
@@ -280,151 +304,8 @@ def test_update_by_filter_empty_meta_raises_error(document_store: PgvectorDocume
280304 document_store .update_by_filter (filters = {"field" : "meta.category" , "operator" : "==" , "value" : "A" }, meta = {})
281305
282306
@pytest.mark.integration
def test_count_documents_by_filter(document_store: PgvectorDocumentStore):
    """Count documents matching a single equality filter and an AND of two filters."""
    # (content, category, status) for each document written to the store.
    rows = [
        ("Doc 1", "A", "active"),
        ("Doc 2", "B", "active"),
        ("Doc 3", "A", "inactive"),
        ("Doc 4", "A", "active"),
    ]
    document_store.write_documents(
        [Document(content=text, meta={"category": category, "status": status}) for text, category, status in rows]
    )

    # Three of the four documents carry category "A".
    category_a = {"field": "meta.category", "operator": "==", "value": "A"}
    assert document_store.count_documents_by_filter(filters=category_a) == 3

    # Two of those are also "active".
    category_a_and_active = {
        "operator": "AND",
        "conditions": [
            {"field": "meta.category", "operator": "==", "value": "A"},
            {"field": "meta.status", "operator": "==", "value": "active"},
        ],
    }
    assert document_store.count_documents_by_filter(filters=category_a_and_active) == 2
308-
309-
@pytest.mark.integration
def test_count_unique_metadata_by_filter(document_store: PgvectorDocumentStore):
    """Count distinct metadata values with no filter, simple and compound filters, and prefixed names."""

    def equals(field, value):
        # Build a fresh equality condition on every call so no dict is shared between queries.
        return {"field": field, "operator": "==", "value": value}

    # (content, category, status, priority) for each document written to the store.
    rows = [
        ("Doc 1", "A", "active", 1),
        ("Doc 2", "B", "active", 2),
        ("Doc 3", "A", "inactive", 1),
        ("Doc 4", "A", "active", 3),
        ("Doc 5", "C", "active", 2),
    ]
    document_store.write_documents(
        [
            Document(content=text, meta={"category": category, "status": status, "priority": priority})
            for text, category, status, priority in rows
        ]
    )

    all_fields = ("category", "status", "priority")

    # No filter: every document contributes.
    unfiltered = document_store.count_unique_metadata_by_filter(filters={}, metadata_fields=list(all_fields))
    assert unfiltered["category"] == 3  # A, B, C
    assert unfiltered["status"] == 2  # active, inactive
    assert unfiltered["priority"] == 3  # 1, 2, 3

    # Restricted to category "A".
    only_a = document_store.count_unique_metadata_by_filter(
        filters=equals("meta.category", "A"),
        metadata_fields=list(all_fields),
    )
    assert only_a["category"] == 1  # Only A
    assert only_a["status"] == 2  # active, inactive
    assert only_a["priority"] == 2  # 1, 3

    # Restricted to category "A" AND status "active".
    only_a_active = document_store.count_unique_metadata_by_filter(
        filters={
            "operator": "AND",
            "conditions": [equals("meta.category", "A"), equals("meta.status", "active")],
        },
        metadata_fields=list(all_fields),
    )
    assert only_a_active["category"] == 1  # Only A
    assert only_a_active["status"] == 1  # Only active
    assert only_a_active["priority"] == 2  # 1, 3

    # Requesting a subset of fields must not report the omitted one.
    subset = document_store.count_unique_metadata_by_filter(filters={}, metadata_fields=["category", "status"])
    assert subset["category"] == 3
    assert subset["status"] == 2
    assert "priority" not in subset

    # Field names given with a "meta." prefix are reported under the bare name.
    prefixed = document_store.count_unique_metadata_by_filter(
        filters={}, metadata_fields=["meta.category", "status", "meta.priority"]
    )
    assert prefixed["category"] == 3
    assert prefixed["status"] == 2
    assert prefixed["priority"] == 3
367-
368-
@pytest.mark.integration
def test_get_metadata_fields_info(document_store: PgvectorDocumentStore):
    """Check that fields info lists ``content`` and each metadata key with the expected type."""
    document_store.write_documents(
        [
            Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}),
            Document(content="Doc 2", meta={"category": "B", "status": "inactive"}),
        ]
    )

    fields_info = document_store.get_metadata_fields_info()

    # Field name -> type string the store is expected to report for it.
    expected_types = {
        "content": "text",
        "category": "text",
        "status": "text",
        "priority": "integer",
    }

    # Presence of every field is checked before any type is inspected.
    for field_name in expected_types:
        assert field_name in fields_info

    for field_name, type_name in expected_types.items():
        assert fields_info[field_name]["type"] == type_name
389-
390-
391307@pytest .mark .integration
392308def test_get_metadata_field_min_max (document_store : PgvectorDocumentStore ):
393- # Test with integer values
394- docs = [
395- Document (content = "Doc 1" , meta = {"priority" : 1 , "age" : 10 }),
396- Document (content = "Doc 2" , meta = {"priority" : 5 , "age" : 20 }),
397- Document (content = "Doc 3" , meta = {"priority" : 3 , "age" : 15 }),
398- Document (content = "Doc 4" , meta = {"priority" : 10 , "age" : 5 }),
399- Document (content = "Doc 6" , meta = {"rating" : 10.5 }),
400- Document (content = "Doc 7" , meta = {"rating" : 20.3 }),
401- Document (content = "Doc 8" , meta = {"rating" : 15.7 }),
402- Document (content = "Doc 9" , meta = {"rating" : 5.2 }),
403- ]
404- document_store .write_documents (docs )
405-
406- # Test with "meta." prefix for integer field
407- min_max_priority = document_store .get_metadata_field_min_max ("meta.priority" )
408- assert min_max_priority ["min" ] == 1
409- assert min_max_priority ["max" ] == 10
410-
411- # Test with "meta." prefix for another integer field
412- min_max_age = document_store .get_metadata_field_min_max ("meta.age" )
413- assert min_max_age ["min" ] == 5
414- assert min_max_age ["max" ] == 20
415-
416- # Test with single value
417- single_doc = [Document (content = "Doc 5" , meta = {"single_value" : 42 })]
418- document_store .write_documents (single_doc )
419- min_max_single = document_store .get_metadata_field_min_max ("meta.single_value" )
420- assert min_max_single ["min" ] == 42
421- assert min_max_single ["max" ] == 42
422-
423- # Test with float values
424- min_max_rating = document_store .get_metadata_field_min_max ("meta.rating" )
425- assert min_max_rating ["min" ] == pytest .approx (5.2 )
426- assert min_max_rating ["max" ] == pytest .approx (20.3 )
427-
428309 # Test with text/string values - lexicographic comparison
429310 text_docs = [
430311 Document (content = "Doc 1" , meta = {"category" : "Zebra" , "status" : "active" }),
0 commit comments