From ceea7b09cc9e9b4a1f1d910486c0c27de50dba36 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 12 Mar 2026 15:51:00 +0100 Subject: [PATCH 01/11] adding new Mixin tests --- haystack/testing/document_store.py | 330 ++++++++++++++++++++++++++++- 1 file changed, 329 insertions(+), 1 deletion(-) diff --git a/haystack/testing/document_store.py b/haystack/testing/document_store.py index f659187614..4d1fe42593 100644 --- a/haystack/testing/document_store.py +++ b/haystack/testing/document_store.py @@ -76,7 +76,7 @@ class WriteDocumentsTest(AssertDocumentsEqualMixin): To use it create a custom test class and override the `document_store` fixture to return your Document Store. The Document Store `filter_documents` method must be at least partly implemented to return all stored Documents - for this tests to work correctly. + for these tests to work correctly. Example usage: ```python @@ -930,6 +930,334 @@ def test_update_by_filter_advanced_filters(document_store: DocumentStore): assert len(featured_docs) == 2 +class CountDocumentsByFilterTest: + """ + Tests for Document Store count_documents_by_filter(). + + Only mix in for stores that implement count_documents_by_filter. + """ + + @staticmethod + def test_count_documents_by_filter_simple(document_store: DocumentStore): + """Test count_documents_by_filter() with a simple equality filter.""" + docs = [ + Document(content="Doc 1", meta={"category": "A", "status": "active"}), + Document(content="Doc 2", meta={"category": "B", "status": "active"}), + Document(content="Doc 3", meta={"category": "A", "status": "inactive"}), + Document(content="Doc 4", meta={"category": "A", "status": "active"}), + ] + document_store.write_documents(docs) + assert document_store.count_documents() == 4 + + count = document_store.count_documents_by_filter( # type:ignore[attr-defined] + filters={"field": "meta.category", "operator": "==", "value": "A"} + ) + assert count == 3 + + count = document_store.count_documents_by_filter( # type:ignore[attr-defined] + filters={"field": "meta.category", "operator": "==", "value": "B"} + ) + assert count == 1 + + @staticmethod + def test_count_documents_by_filter_compound(document_store: DocumentStore): + """Test count_documents_by_filter() with AND filter.""" + docs = [ + Document(content="Doc 1", meta={"category": "A", "status": "active"}), + Document(content="Doc 2", meta={"category": "B", "status": "active"}), + Document(content="Doc 3", meta={"category": "A", "status": "inactive"}), + Document(content="Doc 4", meta={"category": "A", "status": "active"}), + ] + document_store.write_documents(docs) + assert document_store.count_documents() == 4 + + count = document_store.count_documents_by_filter( # type:ignore[attr-defined] + filters={ + "operator": "AND", + "conditions": [ + {"field": "meta.category", "operator": "==", "value": "A"}, + {"field": "meta.status", "operator": "==", "value": "active"}, + ], + } + ) + assert count == 2 + + @staticmethod + def test_count_documents_by_filter_no_matches(document_store: DocumentStore): + """Test count_documents_by_filter() when filter matches no documents.""" + docs = [Document(content="Doc 1", meta={"category": "A"}), Document(content="Doc 2", meta={"category": "B"})] + document_store.write_documents(docs) + assert document_store.count_documents() == 2 + + count = document_store.count_documents_by_filter( # type:ignore[attr-defined] + filters={"field": "meta.category", "operator": "==", "value": "Z"} + ) + assert count == 0 + + @staticmethod + def test_count_documents_by_filter_empty_collection(document_store: DocumentStore): + """Test count_documents_by_filter() on an empty store.""" + assert document_store.count_documents() == 0 + + count = document_store.count_documents_by_filter( # type:ignore[attr-defined] + filters={"field": "meta.category", "operator": "==", "value": "A"} + ) + assert count == 0 + + +class CountUniqueMetadataByFilterTest: + """ + Tests for Document Store count_unique_metadata_by_filter(). + + Only mix in for stores that implement count_unique_metadata_by_filter. + """ + + @staticmethod + def test_count_unique_metadata_by_filter_all_documents(document_store: DocumentStore): + """Test count_unique_metadata_by_filter() with no filter returns distinct counts for all docs.""" + docs = [ + Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), + Document(content="Doc 2", meta={"category": "B", "status": "active", "priority": 2}), + Document(content="Doc 3", meta={"category": "A", "status": "inactive", "priority": 1}), + Document(content="Doc 4", meta={"category": "A", "status": "active", "priority": 3}), + Document(content="Doc 5", meta={"category": "C", "status": "active", "priority": 2}), + ] + document_store.write_documents(docs) + assert document_store.count_documents() == 5 + + counts = document_store.count_unique_metadata_by_filter( # type:ignore[attr-defined] + filters={}, metadata_fields=["category", "status", "priority"] + ) + assert counts["category"] == 3 + assert counts["status"] == 2 + assert counts["priority"] == 3 + + @staticmethod + def test_count_unique_metadata_by_filter_with_filter(document_store: DocumentStore): + """Test count_unique_metadata_by_filter() with a filter.""" + docs = [ + Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), + Document(content="Doc 2", meta={"category": "B", "status": "active", "priority": 2}), + Document(content="Doc 3", meta={"category": "A", "status": "inactive", "priority": 1}), + Document(content="Doc 4", meta={"category": "A", "status": "active", "priority": 3}), + ] + document_store.write_documents(docs) + assert document_store.count_documents() == 4 + + counts = document_store.count_unique_metadata_by_filter( # type:ignore[attr-defined] + filters={"field": "meta.category", "operator": "==", "value": "A"}, metadata_fields=["status", "priority"] + ) + assert counts["status"] == 2 + assert counts["priority"] == 2 + + @staticmethod + def test_count_unique_metadata_by_filter_empty_collection(document_store: DocumentStore): + """Test count_unique_metadata_by_filter() on an empty store.""" + assert document_store.count_documents() == 0 + + counts = document_store.count_unique_metadata_by_filter( # type:ignore[attr-defined] + filters={}, metadata_fields=["category", "status"] + ) + assert counts["category"] == 0 + assert counts["status"] == 0 + + +class GetMetadataFieldsInfoTest: + """ + Tests for Document Store get_metadata_fields_info(). + + Only mix in for stores that implement get_metadata_fields_info. + """ + + @staticmethod + def test_get_metadata_fields_info(document_store: DocumentStore): + """Test get_metadata_fields_info() returns field names and types after writing documents.""" + docs = [ + Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), + Document(content="Doc 2", meta={"category": "B", "status": "inactive", "score": 0.5}), + ] + document_store.write_documents(docs) + assert document_store.count_documents() == 2 + + fields_info = document_store.get_metadata_fields_info() # type:ignore[attr-defined] + + assert "category" in fields_info + assert "status" in fields_info + assert "priority" in fields_info + assert "score" in fields_info + for field_name, info in fields_info.items(): # noqa: B007, PERF102 + assert isinstance(info, dict) + assert "type" in info + + @staticmethod + def test_get_metadata_fields_info_empty_collection(document_store: DocumentStore): + """Test get_metadata_fields_info() on an empty store.""" + assert document_store.count_documents() == 0 + + fields_info = document_store.get_metadata_fields_info() # type:ignore[attr-defined] + assert fields_info == {} + + +class GetMetadataFieldMinMaxTest: + """ + Tests for Document Store get_metadata_field_min_max(). + + Only mix in for stores that implement get_metadata_field_min_max. + """ + + @staticmethod + def test_get_metadata_field_min_max_numeric(document_store: DocumentStore): + """Test get_metadata_field_min_max() with integer field.""" + docs = [ + Document(content="Doc 1", meta={"priority": 1}), + Document(content="Doc 2", meta={"priority": 5}), + Document(content="Doc 3", meta={"priority": 3}), + Document(content="Doc 4", meta={"priority": 10}), + ] + document_store.write_documents(docs) + assert document_store.count_documents() == 4 + + result = document_store.get_metadata_field_min_max("priority") # type:ignore[attr-defined] + assert result["min"] == 1 + assert result["max"] == 10 + + @staticmethod + def test_get_metadata_field_min_max_float(document_store: DocumentStore): + """Test get_metadata_field_min_max() with float field.""" + docs = [ + Document(content="Doc 1", meta={"score": 0.6}), + Document(content="Doc 2", meta={"score": 0.95}), + Document(content="Doc 3", meta={"score": 0.8}), + ] + document_store.write_documents(docs) + assert document_store.count_documents() == 3 + + result = document_store.get_metadata_field_min_max("score") # type:ignore[attr-defined] + assert result["min"] == pytest.approx(0.6) + assert result["max"] == pytest.approx(0.95) + + @staticmethod + def test_get_metadata_field_min_max_single_value(document_store: DocumentStore): + """Test get_metadata_field_min_max() when field has only one value.""" + docs = [Document(content="Doc 1", meta={"priority": 42})] + document_store.write_documents(docs) + assert document_store.count_documents() == 1 + + result = document_store.get_metadata_field_min_max("priority") # type:ignore[attr-defined] + assert result["min"] == 42 + assert result["max"] == 42 + + @staticmethod + def test_get_metadata_field_min_max_empty_collection(document_store: DocumentStore): + """Test get_metadata_field_min_max() on an empty store.""" + assert document_store.count_documents() == 0 + + result = document_store.get_metadata_field_min_max("priority") # type:ignore[attr-defined] + assert result["min"] is None + assert result["max"] is None + + +class GetMetadataFieldUniqueValuesTest: + """ + Tests for Document Store get_metadata_field_unique_values(). + + Only mix in for stores that implement get_metadata_field_unique_values. + Expects the method to return (values_list, total_count) or (values_list, pagination_key). + """ + + @staticmethod + def test_get_metadata_field_unique_values_basic(document_store: DocumentStore): + """Test get_metadata_field_unique_values() returns unique values and total count.""" + docs = [ + Document(content="Doc 1", meta={"category": "A"}), + Document(content="Doc 2", meta={"category": "B"}), + Document(content="Doc 3", meta={"category": "A"}), + Document(content="Doc 4", meta={"category": "C"}), + Document(content="Doc 5", meta={"category": "B"}), + ] + document_store.write_documents(docs) + assert document_store.count_documents() == 5 + + sig = inspect.signature(document_store.get_metadata_field_unique_values) # type:ignore[attr-defined] + params: dict = {} + if "search_term" in sig.parameters: + params["search_term"] = None + if "from_" in sig.parameters: + params["from_"] = 0 + elif "offset" in sig.parameters: + params["offset"] = 0 + if "size" in sig.parameters: + params["size"] = 10 + elif "limit" in sig.parameters: + params["limit"] = 10 + + result = document_store.get_metadata_field_unique_values("category", **params) # type:ignore[attr-defined] + + values = result[0] if isinstance(result, tuple) else result + assert isinstance(values, list) + assert set(values) == {"A", "B", "C"} + if isinstance(result, tuple) and len(result) >= 2 and isinstance(result[1], int): + assert result[1] == 3 + + @staticmethod + def test_get_metadata_field_unique_values_pagination(document_store: DocumentStore): + """Test get_metadata_field_unique_values() with pagination (from_ and size).""" + docs = [ + Document(content="Doc 1", meta={"category": "A"}), + Document(content="Doc 2", meta={"category": "B"}), + Document(content="Doc 3", meta={"category": "C"}), + Document(content="Doc 4", meta={"category": "D"}), + Document(content="Doc 5", meta={"category": "E"}), + ] + document_store.write_documents(docs) + assert document_store.count_documents() == 5 + + sig = inspect.signature(document_store.get_metadata_field_unique_values) # type:ignore[attr-defined] + if "from_" not in sig.parameters and "offset" not in sig.parameters: + pytest.skip("get_metadata_field_unique_values does not support pagination (from_/offset)") + + params_first: dict = {} + if "from_" in sig.parameters: + params_first["from_"] = 0 + params_first["size"] = 2 + elif "offset" in sig.parameters: + params_first["offset"] = 0 + params_first["limit"] = 2 + else: + pytest.skip("get_metadata_field_unique_values does not support pagination") + + result_first = document_store.get_metadata_field_unique_values("category", **params_first) # type:ignore[attr-defined] + values_first = result_first[0] if isinstance(result_first, tuple) else result_first + total = result_first[1] if isinstance(result_first, tuple) and len(result_first) >= 2 else len(values_first) + + assert len(values_first) == 2 + assert total == 5 + + @staticmethod + def test_get_metadata_field_unique_values_empty_collection(document_store: DocumentStore): + """Test get_metadata_field_unique_values() on an empty store.""" + assert document_store.count_documents() == 0 + + sig = inspect.signature(document_store.get_metadata_field_unique_values) # type:ignore[attr-defined] + params: dict = {} + if "search_term" in sig.parameters: + params["search_term"] = None + if "from_" in sig.parameters: + params["from_"] = 0 + elif "offset" in sig.parameters: + params["offset"] = 0 + if "size" in sig.parameters: + params["size"] = 10 + elif "limit" in sig.parameters: + params["limit"] = 10 + + result = document_store.get_metadata_field_unique_values("category", **params) # type:ignore[attr-defined] + values = result[0] if isinstance(result, tuple) else result + assert values == [] + if isinstance(result, tuple) and len(result) >= 2 and isinstance(result[1], int): + assert result[1] == 0 + + class DocumentStoreBaseTests(CountDocumentsTest, DeleteDocumentsTest, FilterDocumentsTest, WriteDocumentsTest): @pytest.fixture def document_store(self) -> DocumentStore: From 9b57bf0421f0148367e2d47de41e4564dfa0f9ac Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 12 Mar 2026 16:44:15 +0100 Subject: [PATCH 02/11] adding more operations + using tests from Mixin --- .../in_memory/document_store.py | 104 ++++++++++++++++ test/document_stores/test_in_memory.py | 112 ++++-------------- 2 files changed, 130 insertions(+), 86 deletions(-) diff --git a/haystack/document_stores/in_memory/document_store.py b/haystack/document_stores/in_memory/document_store.py index 28b643c54b..d7fb784c6b 100644 --- a/haystack/document_stores/in_memory/document_store.py +++ b/haystack/document_stores/in_memory/document_store.py @@ -549,6 +549,110 @@ def delete_by_filter(self, filters: dict[str, Any]) -> int: self.delete_documents(doc_ids) return len(doc_ids) + def count_documents_by_filter(self, filters: dict[str, Any]) -> int: + """ + Returns the number of documents that match the provided filters. + + :param filters: The filters to apply. For filter syntax, see filter_documents. + :returns: The number of documents that match the filters. + """ + if filters: + InMemoryDocumentStore._validate_filters(filters) + return sum(1 for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc)) + return len(self.storage) + + def count_unique_metadata_by_filter(self, filters: dict[str, Any], metadata_fields: list[str]) -> dict[str, int]: + """ + Returns the number of unique values for each specified metadata field from documents matching the filters. + + :param filters: The filters to apply. For filter syntax, see filter_documents. + :param metadata_fields: List of field names to count unique values for. + Field names can include or omit the "meta." prefix. + :returns: A dictionary mapping each metadata field name (without "meta." prefix) + to the count of its unique values among the filtered documents. + """ + if filters: + InMemoryDocumentStore._validate_filters(filters) + docs = [doc for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc)] + else: + docs = list(self.storage.values()) + + result: dict[str, int] = {} + for field in metadata_fields: + key = field.removeprefix("meta.") if field.startswith("meta.") else field + values = {doc.meta.get(key) for doc in docs if key in doc.meta and doc.meta[key] is not None} + result[key] = len(values) + return result + + def get_metadata_fields_info(self) -> dict[str, dict[str, str]]: + """ + Returns information about the metadata fields present in the stored documents. + + Types are inferred from the stored values (keyword, long, float, boolean). + + :returns: A dictionary mapping each metadata field name to a dict with a "type" key. + """ + type_map: dict[str, str] = {} + for doc in self.storage.values(): + for key, value in doc.meta.items(): + if value is None: + continue + if isinstance(value, bool): + type_map[key] = "boolean" + elif isinstance(value, int): + type_map[key] = "long" + elif isinstance(value, float): + type_map[key] = "float" + else: + type_map[key] = "keyword" + return {k: {"type": v} for k, v in type_map.items()} + + def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, Any]: + """ + Returns the minimum and maximum values for the given metadata field across all documents. + + :param metadata_field: The metadata field name. Can include or omit the "meta." prefix. + :returns: A dictionary with "min" and "max" keys. Returns {"min": None, "max": None} + if the field is missing or has no values. + """ + key = metadata_field.removeprefix("meta.") if metadata_field.startswith("meta.") else metadata_field + values = [ + doc.meta[key] + for doc in self.storage.values() + if key in doc.meta and doc.meta[key] is not None and isinstance(doc.meta[key], (int, float, str)) + ] + if not values: + return {"min": None, "max": None} + try: + return {"min": min(values), "max": max(values)} + except TypeError: + return {"min": None, "max": None} + + def get_metadata_field_unique_values( + self, metadata_field: str, search_term: str | None = None, from_: int = 0, size: int = 10 + ) -> tuple[list[str], int]: + """ + Returns unique values for a metadata field. + + Optionally filtered by a search term in content with pagination support. + + :param metadata_field: The metadata field name. Can include or omit the "meta." prefix. + :param search_term: If set, only documents whose content contains this term (case-insensitive) + are considered. + :param from_: Offset for pagination. + :param size: Maximum number of unique values to return. + :returns: A tuple (list of unique values for the requested page, total count of unique values). + """ + key = metadata_field.removeprefix("meta.") if metadata_field.startswith("meta.") else metadata_field + if search_term: + docs = [doc for doc in self.storage.values() if doc.content and search_term.lower() in doc.content.lower()] + else: + docs = list(self.storage.values()) + values = sorted({str(doc.meta[key]) for doc in docs if key in doc.meta and doc.meta[key] is not None}, key=str) + total = len(values) + page = values[from_ : from_ + size] + return (page, total) + def bm25_retrieval( self, query: str, filters: dict[str, Any] | None = None, top_k: int = 10, scale_score: bool = False ) -> list[Document]: diff --git a/test/document_stores/test_in_memory.py b/test/document_stores/test_in_memory.py index 08f0fd27a2..d1cdd67198 100644 --- a/test/document_stores/test_in_memory.py +++ b/test/document_stores/test_in_memory.py @@ -13,10 +13,32 @@ from haystack import Document from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.in_memory import InMemoryDocumentStore -from haystack.testing.document_store import DocumentStoreBaseTests - - -class TestMemoryDocumentStore(DocumentStoreBaseTests): +from haystack.testing.document_store import ( + CountDocumentsByFilterTest, + CountUniqueMetadataByFilterTest, + DeleteAllTest, + DeleteByFilterTest, + DocumentStoreBaseTests, + FilterableDocsFixtureMixin, + GetMetadataFieldMinMaxTest, + GetMetadataFieldsInfoTest, + GetMetadataFieldUniqueValuesTest, + UpdateByFilterTest, +) + + +class TestMemoryDocumentStore( + CountDocumentsByFilterTest, + CountUniqueMetadataByFilterTest, + DeleteAllTest, + DeleteByFilterTest, + DocumentStoreBaseTests, + FilterableDocsFixtureMixin, + GetMetadataFieldMinMaxTest, + GetMetadataFieldsInfoTest, + GetMetadataFieldUniqueValuesTest, + UpdateByFilterTest, +): """ Test InMemoryDocumentStore's specific features """ @@ -109,88 +131,6 @@ def test_write_documents(self, document_store): with pytest.raises(DuplicateDocumentError): document_store.write_documents(docs) - def test_delete_all_documents(self, document_store: InMemoryDocumentStore): - docs = [Document(content="Doc 1", meta={"category": "A"}), Document(content="Doc 2", meta={"category": "B"})] - document_store.write_documents(docs) - assert document_store.count_documents() == 2 - - document_store.delete_all_documents() - assert document_store.count_documents() == 0 - assert document_store.filter_documents() == [] - - # Store remains functional after delete_all - document_store.write_documents([Document(content="New doc")]) - assert document_store.count_documents() == 1 - - def test_delete_all_documents_empty_store(self, document_store: InMemoryDocumentStore): - document_store.delete_all_documents() - assert document_store.count_documents() == 0 - - def test_update_by_filter(self, document_store: InMemoryDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "year": 2023}), - Document(content="Doc 2", meta={"category": "B", "year": 2023}), - Document(content="Doc 3", meta={"category": "A", "year": 2024}), - ] - document_store.write_documents(docs) - - updated = document_store.update_by_filter( - filters={"field": "meta.category", "operator": "==", "value": "A"}, meta={"updated": True, "tag": "foo"} - ) - assert updated == 2 - - all_docs = document_store.filter_documents() - category_a = [d for d in all_docs if d.meta.get("category") == "A"] - category_b = [d for d in all_docs if d.meta.get("category") == "B"] - assert len(category_a) == 2 - assert all(d.meta.get("updated") is True and d.meta.get("tag") == "foo" for d in category_a) - assert len(category_b) == 1 - assert "updated" not in category_b[0].meta and "tag" not in category_b[0].meta - - def test_update_by_filter_no_matches(self, document_store: InMemoryDocumentStore): - docs = [Document(content="Doc 1", meta={"category": "A"}), Document(content="Doc 2", meta={"category": "B"})] - document_store.write_documents(docs) - - updated = document_store.update_by_filter( - filters={"field": "meta.category", "operator": "==", "value": "C"}, meta={"updated": True} - ) - assert updated == 0 - assert document_store.count_documents() == 2 - - def test_delete_by_filter(self, document_store: InMemoryDocumentStore): - docs = [ - Document(content="Doc 1", meta={"category": "A", "year": 2023}), - Document(content="Doc 2", meta={"category": "B", "year": 2023}), - Document(content="Doc 3", meta={"category": "A", "year": 2024}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - deleted = document_store.delete_by_filter(filters={"field": "meta.category", "operator": "==", "value": "A"}) - assert deleted == 2 - assert document_store.count_documents() == 1 - remaining = document_store.filter_documents() - assert remaining[0].meta["category"] == "B" - - deleted = document_store.delete_by_filter(filters={"field": "meta.year", "operator": "==", "value": 2023}) - assert deleted == 1 - assert document_store.count_documents() == 0 - - def test_delete_by_filter_no_matches(self, document_store: InMemoryDocumentStore): - docs = [Document(content="Doc 1", meta={"category": "A"}), Document(content="Doc 2", meta={"category": "B"})] - document_store.write_documents(docs) - - deleted = document_store.delete_by_filter(filters={"field": "meta.category", "operator": "==", "value": "C"}) - assert deleted == 0 - assert document_store.count_documents() == 2 - - def test_delete_by_filter_invalid_filters(self, document_store: InMemoryDocumentStore): - document_store.write_documents([Document(content="Doc 1")]) - with pytest.raises(ValueError, match="Invalid filter syntax"): - document_store.delete_by_filter(filters={"invalid": "filter"}) - with pytest.raises(ValueError, match="Invalid filter syntax"): - document_store.update_by_filter(filters={"invalid": "filter"}, meta={"key": "value"}) - def test_bm25_retrieval(self, document_store: InMemoryDocumentStore): # Tests if the bm25_retrieval method returns the correct document based on the input query. docs = [Document(content="Hello world"), Document(content="Haystack supports multiple languages")] From f2dcce72789d75c68af443b79fe93a3d1718166e Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 12 Mar 2026 16:48:28 +0100 Subject: [PATCH 03/11] reverting changes that were wrongly introduced from another branch --- haystack/testing/document_store.py | 328 ----------------------------- 1 file changed, 328 deletions(-) diff --git a/haystack/testing/document_store.py b/haystack/testing/document_store.py index 4d1fe42593..9756a1e329 100644 --- a/haystack/testing/document_store.py +++ b/haystack/testing/document_store.py @@ -930,334 +930,6 @@ def test_update_by_filter_advanced_filters(document_store: DocumentStore): assert len(featured_docs) == 2 -class CountDocumentsByFilterTest: - """ - Tests for Document Store count_documents_by_filter(). - - Only mix in for stores that implement count_documents_by_filter. - """ - - @staticmethod - def test_count_documents_by_filter_simple(document_store: DocumentStore): - """Test count_documents_by_filter() with a simple equality filter.""" - docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "active"}), - Document(content="Doc 2", meta={"category": "B", "status": "active"}), - Document(content="Doc 3", meta={"category": "A", "status": "inactive"}), - Document(content="Doc 4", meta={"category": "A", "status": "active"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 4 - - count = document_store.count_documents_by_filter( # type:ignore[attr-defined] - filters={"field": "meta.category", "operator": "==", "value": "A"} - ) - assert count == 3 - - count = document_store.count_documents_by_filter( # type:ignore[attr-defined] - filters={"field": "meta.category", "operator": "==", "value": "B"} - ) - assert count == 1 - - @staticmethod - def test_count_documents_by_filter_compound(document_store: DocumentStore): - """Test count_documents_by_filter() with AND filter.""" - docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "active"}), - Document(content="Doc 2", meta={"category": "B", "status": "active"}), - Document(content="Doc 3", meta={"category": "A", "status": "inactive"}), - Document(content="Doc 4", meta={"category": "A", "status": "active"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 4 - - count = document_store.count_documents_by_filter( # type:ignore[attr-defined] - filters={ - "operator": "AND", - "conditions": [ - {"field": "meta.category", "operator": "==", "value": "A"}, - {"field": "meta.status", "operator": "==", "value": "active"}, - ], - } - ) - assert count == 2 - - @staticmethod - def test_count_documents_by_filter_no_matches(document_store: DocumentStore): - """Test count_documents_by_filter() when filter matches no documents.""" - docs = [Document(content="Doc 1", meta={"category": "A"}), Document(content="Doc 2", meta={"category": "B"})] - document_store.write_documents(docs) - assert document_store.count_documents() == 2 - - count = document_store.count_documents_by_filter( # type:ignore[attr-defined] - filters={"field": "meta.category", "operator": "==", "value": "Z"} - ) - assert count == 0 - - @staticmethod - def test_count_documents_by_filter_empty_collection(document_store: DocumentStore): - """Test count_documents_by_filter() on an empty store.""" - assert document_store.count_documents() == 0 - - count = document_store.count_documents_by_filter( # type:ignore[attr-defined] - filters={"field": "meta.category", "operator": "==", "value": "A"} - ) - assert count == 0 - - -class CountUniqueMetadataByFilterTest: - """ - Tests for Document Store count_unique_metadata_by_filter(). - - Only mix in for stores that implement count_unique_metadata_by_filter. - """ - - @staticmethod - def test_count_unique_metadata_by_filter_all_documents(document_store: DocumentStore): - """Test count_unique_metadata_by_filter() with no filter returns distinct counts for all docs.""" - docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), - Document(content="Doc 2", meta={"category": "B", "status": "active", "priority": 2}), - Document(content="Doc 3", meta={"category": "A", "status": "inactive", "priority": 1}), - Document(content="Doc 4", meta={"category": "A", "status": "active", "priority": 3}), - Document(content="Doc 5", meta={"category": "C", "status": "active", "priority": 2}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 5 - - counts = document_store.count_unique_metadata_by_filter( # type:ignore[attr-defined] - filters={}, metadata_fields=["category", "status", "priority"] - ) - assert counts["category"] == 3 - assert counts["status"] == 2 - assert counts["priority"] == 3 - - @staticmethod - def test_count_unique_metadata_by_filter_with_filter(document_store: DocumentStore): - """Test count_unique_metadata_by_filter() with a filter.""" - docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), - Document(content="Doc 2", meta={"category": "B", "status": "active", "priority": 2}), - Document(content="Doc 3", meta={"category": "A", "status": "inactive", "priority": 1}), - Document(content="Doc 4", meta={"category": "A", "status": "active", "priority": 3}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 4 - - counts = document_store.count_unique_metadata_by_filter( # type:ignore[attr-defined] - filters={"field": "meta.category", "operator": "==", "value": "A"}, metadata_fields=["status", "priority"] - ) - assert counts["status"] == 2 - assert counts["priority"] == 2 - - @staticmethod - def test_count_unique_metadata_by_filter_empty_collection(document_store: DocumentStore): - """Test count_unique_metadata_by_filter() on an empty store.""" - assert document_store.count_documents() == 0 - - counts = document_store.count_unique_metadata_by_filter( # type:ignore[attr-defined] - filters={}, metadata_fields=["category", "status"] - ) - assert counts["category"] == 0 - assert counts["status"] == 0 - - -class GetMetadataFieldsInfoTest: - """ - Tests for Document Store get_metadata_fields_info(). - - Only mix in for stores that implement get_metadata_fields_info. - """ - - @staticmethod - def test_get_metadata_fields_info(document_store: DocumentStore): - """Test get_metadata_fields_info() returns field names and types after writing documents.""" - docs = [ - Document(content="Doc 1", meta={"category": "A", "status": "active", "priority": 1}), - Document(content="Doc 2", meta={"category": "B", "status": "inactive", "score": 0.5}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 2 - - fields_info = document_store.get_metadata_fields_info() # type:ignore[attr-defined] - - assert "category" in fields_info - assert "status" in fields_info - assert "priority" in fields_info - assert "score" in fields_info - for field_name, info in fields_info.items(): # noqa: B007, PERF102 - assert isinstance(info, dict) - assert "type" in info - - @staticmethod - def test_get_metadata_fields_info_empty_collection(document_store: DocumentStore): - """Test get_metadata_fields_info() on an empty store.""" - assert document_store.count_documents() == 0 - - fields_info = document_store.get_metadata_fields_info() # type:ignore[attr-defined] - assert fields_info == {} - - -class GetMetadataFieldMinMaxTest: - """ - Tests for Document Store get_metadata_field_min_max(). - - Only mix in for stores that implement get_metadata_field_min_max. - """ - - @staticmethod - def test_get_metadata_field_min_max_numeric(document_store: DocumentStore): - """Test get_metadata_field_min_max() with integer field.""" - docs = [ - Document(content="Doc 1", meta={"priority": 1}), - Document(content="Doc 2", meta={"priority": 5}), - Document(content="Doc 3", meta={"priority": 3}), - Document(content="Doc 4", meta={"priority": 10}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 4 - - result = document_store.get_metadata_field_min_max("priority") # type:ignore[attr-defined] - assert result["min"] == 1 - assert result["max"] == 10 - - @staticmethod - def test_get_metadata_field_min_max_float(document_store: DocumentStore): - """Test get_metadata_field_min_max() with float field.""" - docs = [ - Document(content="Doc 1", meta={"score": 0.6}), - Document(content="Doc 2", meta={"score": 0.95}), - Document(content="Doc 3", meta={"score": 0.8}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 3 - - result = document_store.get_metadata_field_min_max("score") # type:ignore[attr-defined] - assert result["min"] == pytest.approx(0.6) - assert result["max"] == pytest.approx(0.95) - - @staticmethod - def test_get_metadata_field_min_max_single_value(document_store: DocumentStore): - """Test get_metadata_field_min_max() when field has only one value.""" - docs = [Document(content="Doc 1", meta={"priority": 42})] - document_store.write_documents(docs) - assert document_store.count_documents() == 1 - - result = document_store.get_metadata_field_min_max("priority") # type:ignore[attr-defined] - assert result["min"] == 42 - assert result["max"] == 42 - - @staticmethod - def test_get_metadata_field_min_max_empty_collection(document_store: DocumentStore): - """Test get_metadata_field_min_max() on an empty store.""" - assert document_store.count_documents() == 0 - - result = document_store.get_metadata_field_min_max("priority") # type:ignore[attr-defined] - assert result["min"] is None - assert result["max"] is None - - -class GetMetadataFieldUniqueValuesTest: - """ - Tests for Document Store get_metadata_field_unique_values(). - - Only mix in for stores that implement get_metadata_field_unique_values. - Expects the method to return (values_list, total_count) or (values_list, pagination_key). - """ - - @staticmethod - def test_get_metadata_field_unique_values_basic(document_store: DocumentStore): - """Test get_metadata_field_unique_values() returns unique values and total count.""" - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - Document(content="Doc 3", meta={"category": "A"}), - Document(content="Doc 4", meta={"category": "C"}), - Document(content="Doc 5", meta={"category": "B"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 5 - - sig = inspect.signature(document_store.get_metadata_field_unique_values) # type:ignore[attr-defined] - params: dict = {} - if "search_term" in sig.parameters: - params["search_term"] = None - if "from_" in sig.parameters: - params["from_"] = 0 - elif "offset" in sig.parameters: - params["offset"] = 0 - if "size" in sig.parameters: - params["size"] = 10 - elif "limit" in sig.parameters: - params["limit"] = 10 - - result = document_store.get_metadata_field_unique_values("category", **params) # type:ignore[attr-defined] - - values = result[0] if isinstance(result, tuple) else result - assert isinstance(values, list) - assert set(values) == {"A", "B", "C"} - if isinstance(result, tuple) and len(result) >= 2 and isinstance(result[1], int): - assert result[1] == 3 - - @staticmethod - def test_get_metadata_field_unique_values_pagination(document_store: DocumentStore): - """Test get_metadata_field_unique_values() with pagination (from_ and size).""" - docs = [ - Document(content="Doc 1", meta={"category": "A"}), - Document(content="Doc 2", meta={"category": "B"}), - Document(content="Doc 3", meta={"category": "C"}), - Document(content="Doc 4", meta={"category": "D"}), - Document(content="Doc 5", meta={"category": "E"}), - ] - document_store.write_documents(docs) - assert document_store.count_documents() == 5 - - sig = inspect.signature(document_store.get_metadata_field_unique_values) # type:ignore[attr-defined] - if "from_" not in sig.parameters and "offset" not in sig.parameters: - pytest.skip("get_metadata_field_unique_values does not support pagination (from_/offset)") - - params_first: dict = {} - if "from_" in sig.parameters: - params_first["from_"] = 0 - params_first["size"] = 2 - elif "offset" in sig.parameters: - params_first["offset"] = 0 - params_first["limit"] = 2 - else: - pytest.skip("get_metadata_field_unique_values does not support pagination") - - result_first = document_store.get_metadata_field_unique_values("category", **params_first) # type:ignore[attr-defined] - values_first = result_first[0] if isinstance(result_first, tuple) else result_first - total = result_first[1] if isinstance(result_first, tuple) and len(result_first) >= 2 else len(values_first) - - assert len(values_first) == 2 - assert total == 5 - - @staticmethod - def test_get_metadata_field_unique_values_empty_collection(document_store: DocumentStore): - """Test get_metadata_field_unique_values() on an empty store.""" - assert document_store.count_documents() == 0 - - sig = inspect.signature(document_store.get_metadata_field_unique_values) # type:ignore[attr-defined] - params: dict = {} - if "search_term" in sig.parameters: - params["search_term"] = None - if "from_" in sig.parameters: - params["from_"] = 0 - elif "offset" in sig.parameters: - params["offset"] = 0 - if "size" in sig.parameters: - params["size"] = 10 - elif "limit" in sig.parameters: - params["limit"] = 10 - - result = document_store.get_metadata_field_unique_values("category", **params) # type:ignore[attr-defined] - values = result[0] if isinstance(result, tuple) else result - assert values == [] - if isinstance(result, tuple) and len(result) >= 2 and isinstance(result[1], int): - assert result[1] == 0 - - class DocumentStoreBaseTests(CountDocumentsTest, DeleteDocumentsTest, FilterDocumentsTest, WriteDocumentsTest): @pytest.fixture def document_store(self) -> DocumentStore: From 806d2522f27caa664a52f34cccb1110bf660440b Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 12 Mar 2026 16:49:01 +0100 Subject: [PATCH 04/11] reverting changes that were wrongly introduced from another branch --- haystack/testing/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/testing/document_store.py b/haystack/testing/document_store.py index 9756a1e329..f659187614 100644 --- a/haystack/testing/document_store.py +++ b/haystack/testing/document_store.py @@ -76,7 +76,7 @@ class WriteDocumentsTest(AssertDocumentsEqualMixin): To use it create a custom test class and override the `document_store` fixture to return your Document Store. The Document Store `filter_documents` method must be at least partly implemented to return all stored Documents - for these tests to work correctly. + for this tests to work correctly. Example usage: ```python From 7d556bdd78dcc986d7e43571132c849a774cafc7 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 12 Mar 2026 17:01:46 +0100 Subject: [PATCH 05/11] updating tests --- test/document_stores/test_in_memory.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/test/document_stores/test_in_memory.py b/test/document_stores/test_in_memory.py index d1cdd67198..7904e1cde5 100644 --- a/test/document_stores/test_in_memory.py +++ b/test/document_stores/test_in_memory.py @@ -14,30 +14,16 @@ from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.in_memory import InMemoryDocumentStore from haystack.testing.document_store import ( - CountDocumentsByFilterTest, - CountUniqueMetadataByFilterTest, DeleteAllTest, DeleteByFilterTest, DocumentStoreBaseTests, FilterableDocsFixtureMixin, - GetMetadataFieldMinMaxTest, - GetMetadataFieldsInfoTest, - GetMetadataFieldUniqueValuesTest, UpdateByFilterTest, ) class TestMemoryDocumentStore( - CountDocumentsByFilterTest, - CountUniqueMetadataByFilterTest, - DeleteAllTest, - DeleteByFilterTest, - DocumentStoreBaseTests, - FilterableDocsFixtureMixin, - GetMetadataFieldMinMaxTest, - GetMetadataFieldsInfoTest, - GetMetadataFieldUniqueValuesTest, - UpdateByFilterTest, + DeleteAllTest, DeleteByFilterTest, DocumentStoreBaseTests, FilterableDocsFixtureMixin, UpdateByFilterTest ): """ Test InMemoryDocumentStore's specific features From 8ea51cd7eaf6b2e2d44a2465dd377fc8830b8f77 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 12 Mar 2026 17:03:47 +0100 Subject: [PATCH 06/11] adding release notes --- ...ssing-operations-in-memory-doc-store-4790cdd277b832ea.yaml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 releasenotes/notes/add-missing-operations-in-memory-doc-store-4790cdd277b832ea.yaml diff --git a/releasenotes/notes/add-missing-operations-in-memory-doc-store-4790cdd277b832ea.yaml b/releasenotes/notes/add-missing-operations-in-memory-doc-store-4790cdd277b832ea.yaml new file mode 100644 index 0000000000..fb21fc1e40 --- /dev/null +++ b/releasenotes/notes/add-missing-operations-in-memory-doc-store-4790cdd277b832ea.yaml @@ -0,0 +1,4 @@ +--- +features: + - | + Added new operations to the ``InMemoryDocumentStore``: count_documents_by_filter, count_unique_metadata_by_filter, get_metadata_fields_info, get_metadata_field_min_max, get_metadata_field_unique_values From d80e605bb237250290c5ac8d954fef29cd7f6723 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 18 Mar 2026 12:02:43 +0100 Subject: [PATCH 07/11] Update haystack/document_stores/in_memory/document_store.py Co-authored-by: Stefano Fiorucci --- haystack/document_stores/in_memory/document_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/haystack/document_stores/in_memory/document_store.py b/haystack/document_stores/in_memory/document_store.py index b102b07258..fdb20342a2 100644 --- a/haystack/document_stores/in_memory/document_store.py +++ b/haystack/document_stores/in_memory/document_store.py @@ -612,7 +612,7 @@ def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, Any]: Returns the minimum and maximum values for the given metadata field across all documents. :param metadata_field: The metadata field name. Can include or omit the "meta." prefix. - :returns: A dictionary with "min" and "max" keys. Returns {"min": None, "max": None} + :returns: A dictionary with "min" and "max" keys. Returns `{"min": None, "max": None}` if the field is missing or has no values. """ key = metadata_field.removeprefix("meta.") if metadata_field.startswith("meta.") else metadata_field From 1e1772ca346ecda01ca91d90a0d647fc8c740bd9 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 18 Mar 2026 16:35:45 +0100 Subject: [PATCH 08/11] adding more Mixin tests --- test/document_stores/test_in_memory.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/test/document_stores/test_in_memory.py b/test/document_stores/test_in_memory.py index 92ba98926b..f3659a4841 100644 --- a/test/document_stores/test_in_memory.py +++ b/test/document_stores/test_in_memory.py @@ -14,16 +14,30 @@ from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError from haystack.document_stores.in_memory import InMemoryDocumentStore from haystack.testing.document_store import ( + CountDocumentsByFilterTest, + CountUniqueMetadataByFilterTest, DeleteAllTest, DeleteByFilterTest, DocumentStoreBaseTests, FilterableDocsFixtureMixin, + GetMetadataFieldMinMaxTest, + GetMetadataFieldsInfoTest, + GetMetadataFieldUniqueValuesTest, UpdateByFilterTest, ) class TestMemoryDocumentStore( - DeleteAllTest, DeleteByFilterTest, DocumentStoreBaseTests, FilterableDocsFixtureMixin, UpdateByFilterTest + CountDocumentsByFilterTest, + CountUniqueMetadataByFilterTest, + DeleteAllTest, + DeleteByFilterTest, + DocumentStoreBaseTests, + FilterableDocsFixtureMixin, + GetMetadataFieldMinMaxTest, + GetMetadataFieldUniqueValuesTest, + GetMetadataFieldsInfoTest, + UpdateByFilterTest, ): """ Test InMemoryDocumentStore's specific features From d6d81a638d3ed71be5dd1a6e249c73e57418db42 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Wed, 18 Mar 2026 16:51:04 +0100 Subject: [PATCH 09/11] using inherentance class --- test/document_stores/test_in_memory.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/test/document_stores/test_in_memory.py b/test/document_stores/test_in_memory.py index f3659a4841..f98a6099be 100644 --- a/test/document_stores/test_in_memory.py +++ b/test/document_stores/test_in_memory.py @@ -16,28 +16,23 @@ from haystack.testing.document_store import ( CountDocumentsByFilterTest, CountUniqueMetadataByFilterTest, - DeleteAllTest, - DeleteByFilterTest, + DocumentStoreBaseExtendedTests, DocumentStoreBaseTests, FilterableDocsFixtureMixin, GetMetadataFieldMinMaxTest, GetMetadataFieldsInfoTest, GetMetadataFieldUniqueValuesTest, - UpdateByFilterTest, ) class TestMemoryDocumentStore( + DocumentStoreBaseExtendedTests, CountDocumentsByFilterTest, CountUniqueMetadataByFilterTest, - DeleteAllTest, - DeleteByFilterTest, - DocumentStoreBaseTests, FilterableDocsFixtureMixin, GetMetadataFieldMinMaxTest, GetMetadataFieldUniqueValuesTest, GetMetadataFieldsInfoTest, - UpdateByFilterTest, ): """ Test InMemoryDocumentStore's specific features From 5da06f84724180700c8a83398c67425ca5f7a720 Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 19 Mar 2026 09:54:18 +0100 Subject: [PATCH 10/11] fixes/improvements from PR comments --- .../in_memory/document_store.py | 24 +++++++++---------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/haystack/document_stores/in_memory/document_store.py b/haystack/document_stores/in_memory/document_store.py index fb4590d0c5..07b476aee9 100644 --- a/haystack/document_stores/in_memory/document_store.py +++ b/haystack/document_stores/in_memory/document_store.py @@ -555,7 +555,9 @@ def count_documents_by_filter(self, filters: dict[str, Any]) -> int: """ Returns the number of documents that match the provided filters. - :param filters: The filters to apply. For filter syntax, see filter_documents. + :param filters: The filters to apply. + For a detailed specification of the filters, refer to the + [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering). :returns: The number of documents that match the filters. """ if filters: @@ -567,7 +569,9 @@ def count_unique_metadata_by_filter(self, filters: dict[str, Any], metadata_fiel """ Returns the number of unique values for each specified metadata field from documents matching the filters. - :param filters: The filters to apply. For filter syntax, see filter_documents. + :param filters: The filters to apply. + For a detailed specification of the filters, refer to the + [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering). :param metadata_fields: List of field names to count unique values for. Field names can include or omit the "meta." prefix. :returns: A dictionary mapping each metadata field name (without "meta." prefix) @@ -602,7 +606,7 @@ def get_metadata_fields_info(self) -> dict[str, dict[str, str]]: if isinstance(value, bool): type_map[key] = "boolean" elif isinstance(value, int): - type_map[key] = "long" + type_map[key] = "int" elif isinstance(value, float): type_map[key] = "float" else: @@ -631,19 +635,15 @@ def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, Any]: return {"min": None, "max": None} def get_metadata_field_unique_values( - self, metadata_field: str, search_term: str | None = None, from_: int = 0, size: int = 10 + self, metadata_field: str, search_term: str | None = None ) -> tuple[list[str], int]: """ - Returns unique values for a metadata field. - - Optionally filtered by a search term in content with pagination support. + Returns unique values for a metadata field, optionally filtered by a search term in content. :param metadata_field: The metadata field name. Can include or omit the "meta." prefix. :param search_term: If set, only documents whose content contains this term (case-insensitive) are considered. - :param from_: Offset for pagination. - :param size: Maximum number of unique values to return. - :returns: A tuple (list of unique values for the requested page, total count of unique values). + :returns: A tuple of (list of unique values, total count of unique values). """ key = metadata_field.removeprefix("meta.") if metadata_field.startswith("meta.") else metadata_field if search_term: @@ -651,9 +651,7 @@ def get_metadata_field_unique_values( else: docs = list(self.storage.values()) values = sorted({str(doc.meta[key]) for doc in docs if key in doc.meta and doc.meta[key] is not None}, key=str) - total = len(values) - page = values[from_ : from_ + size] - return (page, total) + return values, len(values) def bm25_retrieval( self, query: str, filters: dict[str, Any] | None = None, top_k: int = 10, scale_score: bool = False From 093ee17ec451ed4e517f5c9e4d75519521438b1f Mon Sep 17 00:00:00 2001 From: "David S. Batista" Date: Thu, 19 Mar 2026 10:24:07 +0100 Subject: [PATCH 11/11] fixing a few more docstrings --- .../in_memory/document_store.py | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/haystack/document_stores/in_memory/document_store.py b/haystack/document_stores/in_memory/document_store.py index 07b476aee9..9d706787b2 100644 --- a/haystack/document_stores/in_memory/document_store.py +++ b/haystack/document_stores/in_memory/document_store.py @@ -247,7 +247,7 @@ def _score_bm25okapi(self, query: str, documents: list[Document]) -> list[tuple[ The list of documents to score, should be produced by the filter_documents method; may be an empty list. :returns: - A list of tuples, each containing a Document and its BM25L score. + A list of tuples, each containing a Document and its BM25Okapi score. """ k = self.bm25_parameters.get("k1", 1.5) b = self.bm25_parameters.get("b", 0.75) @@ -274,7 +274,7 @@ def _compute_idf(tokens: list[str]) -> dict[str, float]: return {tok: idf.get(tok, 0.0) for tok in tokens} def _compute_tf(token: str, freq: dict[str, int], doc_len: int) -> float: - """Per-token BM25L computation.""" + """Per-token BM25Okapi computation.""" freq_term = freq.get(token, 0.0) freq_norm = freq_term + k * (1 - b + b * doc_len / self._avg_doc_len) return freq_term * (1.0 + k) / freq_norm @@ -376,7 +376,7 @@ def from_dict(cls, data: dict[str, Any]) -> "InMemoryDocumentStore": def save_to_disk(self, path: str) -> None: """ - Write the database and its' data to disk as a JSON file. + Write the database and its data to disk as a JSON file. :param path: The path to the JSON file. """ @@ -388,7 +388,7 @@ def save_to_disk(self, path: str) -> None: @classmethod def load_from_disk(cls, path: str) -> "InMemoryDocumentStore": """ - Load the database and its' data from disk as a JSON file. + Load the database and its data from disk as a JSON file. :param path: The path to the JSON file. :returns: The loaded InMemoryDocumentStore. @@ -411,7 +411,7 @@ def load_from_disk(cls, path: str) -> "InMemoryDocumentStore": def count_documents(self) -> int: """ - Returns the number of how many documents are present in the DocumentStore. + Returns the number of documents present in the DocumentStore. """ return len(self.storage.keys()) @@ -419,10 +419,8 @@ def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Docume """ Returns the documents that match the filters provided. - For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol - documentation. - - :param filters: The filters to apply to the document list. + :param filters: The filters to apply. For a detailed specification of the filters, refer to the + [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering). :returns: A list of Documents that match the given filters. """ if filters: @@ -485,7 +483,7 @@ def delete_documents(self, document_ids: list[str]) -> None: """ Deletes all documents with matching document_ids from the DocumentStore. - :param document_ids: The object_ids to delete. + :param document_ids: The document_ids to delete. """ for doc_id in document_ids: if doc_id not in self.storage.keys(): @@ -594,7 +592,7 @@ def get_metadata_fields_info(self) -> dict[str, dict[str, str]]: """ Returns information about the metadata fields present in the stored documents. - Types are inferred from the stored values (keyword, long, float, boolean). + Types are inferred from the stored values (keyword, int, float, boolean). :returns: A dictionary mapping each metadata field name to a dict with a "type" key. """ @@ -827,7 +825,7 @@ def _compute_query_embedding_similarity_scores( async def count_documents_async(self) -> int: """ - Returns the number of how many documents are present in the DocumentStore. + Returns the number of documents present in the DocumentStore. """ return len(self.storage.keys()) @@ -835,10 +833,8 @@ async def filter_documents_async(self, filters: dict[str, Any] | None = None) -> """ Returns the documents that match the filters provided. - For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol - documentation. - - :param filters: The filters to apply to the document list. + :param filters: The filters to apply. For a detailed specification of the filters, refer to the + [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering). :returns: A list of Documents that match the given filters. """ return await asyncio.get_running_loop().run_in_executor( @@ -861,7 +857,7 @@ async def delete_documents_async(self, document_ids: list[str]) -> None: """ Deletes all documents with matching document_ids from the DocumentStore. - :param document_ids: The object_ids to delete. + :param document_ids: The document_ids to delete. """ await asyncio.get_running_loop().run_in_executor( self.executor, lambda: self.delete_documents(document_ids=document_ids)