diff --git a/haystack/document_stores/in_memory/document_store.py b/haystack/document_stores/in_memory/document_store.py
index b56d832536..9d706787b2 100644
--- a/haystack/document_stores/in_memory/document_store.py
+++ b/haystack/document_stores/in_memory/document_store.py
@@ -247,7 +247,7 @@ def _score_bm25okapi(self, query: str, documents: list[Document]) -> list[tuple[
             The list of documents to score, should be produced by
             the filter_documents method; may be an empty list.
         :returns:
-            A list of tuples, each containing a Document and its BM25L score.
+            A list of tuples, each containing a Document and its BM25Okapi score.
         """
         k = self.bm25_parameters.get("k1", 1.5)
         b = self.bm25_parameters.get("b", 0.75)
@@ -274,7 +274,7 @@ def _compute_idf(tokens: list[str]) -> dict[str, float]:
             return {tok: idf.get(tok, 0.0) for tok in tokens}
 
         def _compute_tf(token: str, freq: dict[str, int], doc_len: int) -> float:
-            """Per-token BM25L computation."""
+            """Per-token BM25Okapi computation."""
             freq_term = freq.get(token, 0.0)
             freq_norm = freq_term + k * (1 - b + b * doc_len / self._avg_doc_len)
             return freq_term * (1.0 + k) / freq_norm
@@ -376,7 +376,7 @@ def from_dict(cls, data: dict[str, Any]) -> "InMemoryDocumentStore":
 
     def save_to_disk(self, path: str) -> None:
         """
-        Write the database and its' data to disk as a JSON file.
+        Write the database and its data to disk as a JSON file.
 
         :param path: The path to the JSON file.
         """
@@ -388,7 +388,7 @@ def save_to_disk(self, path: str) -> None:
     @classmethod
     def load_from_disk(cls, path: str) -> "InMemoryDocumentStore":
         """
-        Load the database and its' data from disk as a JSON file.
+        Load the database and its data from disk as a JSON file.
 
         :param path: The path to the JSON file.
         :returns: The loaded InMemoryDocumentStore.
@@ -411,7 +411,7 @@ def load_from_disk(cls, path: str) -> "InMemoryDocumentStore":
 
     def count_documents(self) -> int:
         """
-        Returns the number of how many documents are present in the DocumentStore.
+        Returns the number of documents present in the DocumentStore.
         """
         return len(self.storage.keys())
 
@@ -419,10 +419,8 @@ def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Docume
         """
         Returns the documents that match the filters provided.
 
-        For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol
-        documentation.
-
-        :param filters: The filters to apply to the document list.
+        :param filters: The filters to apply. For a detailed specification of the filters, refer to the
+            [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
         :returns: A list of Documents that match the given filters.
         """
         if filters:
@@ -485,7 +483,7 @@ def delete_documents(self, document_ids: list[str]) -> None:
         """
         Deletes all documents with matching document_ids from the DocumentStore.
 
-        :param document_ids: The object_ids to delete.
+        :param document_ids: The document_ids to delete.
         """
         for doc_id in document_ids:
             if doc_id not in self.storage.keys():
@@ -551,6 +549,121 @@ def delete_by_filter(self, filters: dict[str, Any]) -> int:
         self.delete_documents(doc_ids)
         return len(doc_ids)
 
+    def count_documents_by_filter(self, filters: dict[str, Any]) -> int:
+        """
+        Returns the number of documents that match the provided filters.
+
+        :param filters: The filters to apply.
+            For a detailed specification of the filters, refer to the
+            [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
+        :returns: The number of documents that match the filters.
+        """
+        if filters:
+            InMemoryDocumentStore._validate_filters(filters)
+            return sum(1 for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc))
+        return len(self.storage)
+
+    def count_unique_metadata_by_filter(self, filters: dict[str, Any], metadata_fields: list[str]) -> dict[str, int]:
+        """
+        Returns the number of unique values for each specified metadata field from documents matching the filters.
+
+        :param filters: The filters to apply.
+            For a detailed specification of the filters, refer to the
+            [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
+        :param metadata_fields: List of field names to count unique values for.
+            Field names can include or omit the "meta." prefix.
+        :returns: A dictionary mapping each metadata field name (without "meta." prefix)
+            to the count of its unique values among the filtered documents.
+            `None` values are ignored; unhashable values (lists, dicts) are compared by their `repr`.
+        """
+        if filters:
+            InMemoryDocumentStore._validate_filters(filters)
+            docs = [doc for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc)]
+        else:
+            docs = list(self.storage.values())
+
+        result: dict[str, int] = {}
+        for field in metadata_fields:
+            key = field.removeprefix("meta.")
+            values: set[Any] = set()
+            for doc in docs:
+                value = doc.meta.get(key)
+                if value is None:
+                    continue
+                try:
+                    values.add(value)
+                except TypeError:
+                    # Unhashable values (e.g. lists or dicts) cannot be added to a set directly;
+                    # count them by a tagged repr instead of letting the whole call fail.
+                    values.add(("__unhashable__", repr(value)))
+            result[key] = len(values)
+        return result
+
+    def get_metadata_fields_info(self) -> dict[str, dict[str, str]]:
+        """
+        Returns information about the metadata fields present in the stored documents.
+
+        Types are inferred from the stored values (keyword, int, float, boolean).
+
+        :returns: A dictionary mapping each metadata field name to a dict with a "type" key.
+        """
+        type_map: dict[str, str] = {}
+        for doc in self.storage.values():
+            for key, value in doc.meta.items():
+                if value is None:
+                    continue
+                if isinstance(value, bool):
+                    type_map[key] = "boolean"
+                elif isinstance(value, int):
+                    type_map[key] = "int"
+                elif isinstance(value, float):
+                    type_map[key] = "float"
+                else:
+                    type_map[key] = "keyword"
+        return {k: {"type": v} for k, v in type_map.items()}
+
+    def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, Any]:
+        """
+        Returns the minimum and maximum values for the given metadata field across all documents.
+
+        :param metadata_field: The metadata field name. Can include or omit the "meta." prefix.
+        :returns: A dictionary with "min" and "max" keys. Returns `{"min": None, "max": None}`
+            if the field is missing, has no values, or holds values that cannot be compared
+            with each other (for example, a mix of strings and numbers).
+        """
+        key = metadata_field.removeprefix("meta.")
+        values = [
+            doc.meta[key]
+            for doc in self.storage.values()
+            if key in doc.meta and doc.meta[key] is not None and isinstance(doc.meta[key], (int, float, str))
+        ]
+        if not values:
+            return {"min": None, "max": None}
+        try:
+            return {"min": min(values), "max": max(values)}
+        except TypeError:
+            return {"min": None, "max": None}
+
+    def get_metadata_field_unique_values(
+        self, metadata_field: str, search_term: str | None = None
+    ) -> tuple[list[str], int]:
+        """
+        Returns unique values for a metadata field, optionally filtered by a search term in content.
+
+        :param metadata_field: The metadata field name. Can include or omit the "meta." prefix.
+        :param search_term: If set, only documents whose content contains this term (case-insensitive)
+            are considered.
+        :returns: A tuple of (list of unique values, total count of unique values).
+        """
+        key = metadata_field.removeprefix("meta.")
+        if search_term:
+            needle = search_term.lower()
+            docs = [doc for doc in self.storage.values() if doc.content and needle in doc.content.lower()]
+        else:
+            docs = list(self.storage.values())
+        values = sorted({str(doc.meta[key]) for doc in docs if key in doc.meta and doc.meta[key] is not None})
+        return values, len(values)
+
     def bm25_retrieval(
         self, query: str, filters: dict[str, Any] | None = None, top_k: int = 10, scale_score: bool = False
     ) -> list[Document]:
@@ -725,7 +838,7 @@ def _compute_query_embedding_similarity_scores(
 
     async def count_documents_async(self) -> int:
         """
-        Returns the number of how many documents are present in the DocumentStore.
+        Returns the number of documents present in the DocumentStore.
         """
         return len(self.storage.keys())
 
@@ -733,10 +846,8 @@ async def filter_documents_async(self, filters: dict[str, Any] | None = None) ->
         """
         Returns the documents that match the filters provided.
 
-        For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol
-        documentation.
-
-        :param filters: The filters to apply to the document list.
+        :param filters: The filters to apply. For a detailed specification of the filters, refer to the
+            [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
         :returns: A list of Documents that match the given filters.
         """
         return await asyncio.get_running_loop().run_in_executor(
@@ -759,7 +870,7 @@ async def delete_documents_async(self, document_ids: list[str]) -> None:
         """
         Deletes all documents with matching document_ids from the DocumentStore.
 
-        :param document_ids: The object_ids to delete.
+        :param document_ids: The document_ids to delete.
         """
         await asyncio.get_running_loop().run_in_executor(
             self.executor, lambda: self.delete_documents(document_ids=document_ids)
diff --git a/releasenotes/notes/add-missing-operations-in-memory-doc-store-4790cdd277b832ea.yaml b/releasenotes/notes/add-missing-operations-in-memory-doc-store-4790cdd277b832ea.yaml
new file mode 100644
index 0000000000..fb21fc1e40
--- /dev/null
+++ b/releasenotes/notes/add-missing-operations-in-memory-doc-store-4790cdd277b832ea.yaml
@@ -0,0 +1,4 @@
+---
+features:
+  - |
+    Added new operations to the ``InMemoryDocumentStore``: ``count_documents_by_filter``, ``count_unique_metadata_by_filter``, ``get_metadata_fields_info``, ``get_metadata_field_min_max``, and ``get_metadata_field_unique_values``.
diff --git a/test/document_stores/test_in_memory.py b/test/document_stores/test_in_memory.py
index 3d91ad345a..f98a6099be 100644
--- a/test/document_stores/test_in_memory.py
+++ b/test/document_stores/test_in_memory.py
@@ -13,10 +13,27 @@
 from haystack import Document
 from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
 from haystack.document_stores.in_memory import InMemoryDocumentStore
-from haystack.testing.document_store import DocumentStoreBaseTests
-
-
-class TestMemoryDocumentStore(DocumentStoreBaseTests):
+from haystack.testing.document_store import (
+    CountDocumentsByFilterTest,
+    CountUniqueMetadataByFilterTest,
+    DocumentStoreBaseExtendedTests,
+    DocumentStoreBaseTests,
+    FilterableDocsFixtureMixin,
+    GetMetadataFieldMinMaxTest,
+    GetMetadataFieldsInfoTest,
+    GetMetadataFieldUniqueValuesTest,
+)
+
+
+class TestMemoryDocumentStore(
+    DocumentStoreBaseExtendedTests,
+    CountDocumentsByFilterTest,
+    CountUniqueMetadataByFilterTest,
+    FilterableDocsFixtureMixin,
+    GetMetadataFieldMinMaxTest,
+    GetMetadataFieldUniqueValuesTest,
+    GetMetadataFieldsInfoTest,
+):
     """
     Test InMemoryDocumentStore's specific features
     """
@@ -109,88 +126,6 @@ def test_write_documents(self, document_store):
         with pytest.raises(DuplicateDocumentError):
             document_store.write_documents(docs)
 
-    def test_delete_all_documents(self, document_store: InMemoryDocumentStore):
-        docs = [Document(content="Doc 1", meta={"category": "A"}), Document(content="Doc 2", meta={"category": "B"})]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 2
-
-        document_store.delete_all_documents()
-        assert document_store.count_documents() == 0
-        assert document_store.filter_documents() == []
-
-        # Store remains functional after delete_all
-        document_store.write_documents([Document(content="New doc")])
-        assert document_store.count_documents() == 1
-
-    def test_delete_all_documents_empty_store(self, document_store: InMemoryDocumentStore):
-        document_store.delete_all_documents()
-        assert document_store.count_documents() == 0
-
-    def test_update_by_filter(self, document_store: InMemoryDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A", "year": 2023}),
-            Document(content="Doc 2", meta={"category": "B", "year": 2023}),
-            Document(content="Doc 3", meta={"category": "A", "year": 2024}),
-        ]
-        document_store.write_documents(docs)
-
-        updated = document_store.update_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "A"}, meta={"updated": True, "tag": "foo"}
-        )
-        assert updated == 2
-
-        all_docs = document_store.filter_documents()
-        category_a = [d for d in all_docs if d.meta.get("category") == "A"]
-        category_b = [d for d in all_docs if d.meta.get("category") == "B"]
-        assert len(category_a) == 2
-        assert all(d.meta.get("updated") is True and d.meta.get("tag") == "foo" for d in category_a)
-        assert len(category_b) == 1
-        assert "updated" not in category_b[0].meta and "tag" not in category_b[0].meta
-
-    def test_update_by_filter_no_matches(self, document_store: InMemoryDocumentStore):
-        docs = [Document(content="Doc 1", meta={"category": "A"}), Document(content="Doc 2", meta={"category": "B"})]
-        document_store.write_documents(docs)
-
-        updated = document_store.update_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "C"}, meta={"updated": True}
-        )
-        assert updated == 0
-        assert document_store.count_documents() == 2
-
-    def test_delete_by_filter(self, document_store: InMemoryDocumentStore):
-        docs = [
-            Document(content="Doc 1", meta={"category": "A", "year": 2023}),
-            Document(content="Doc 2", meta={"category": "B", "year": 2023}),
-            Document(content="Doc 3", meta={"category": "A", "year": 2024}),
-        ]
-        document_store.write_documents(docs)
-        assert document_store.count_documents() == 3
-
-        deleted = document_store.delete_by_filter(filters={"field": "meta.category", "operator": "==", "value": "A"})
-        assert deleted == 2
-        assert document_store.count_documents() == 1
-        remaining = document_store.filter_documents()
-        assert remaining[0].meta["category"] == "B"
-
-        deleted = document_store.delete_by_filter(filters={"field": "meta.year", "operator": "==", "value": 2023})
-        assert deleted == 1
-        assert document_store.count_documents() == 0
-
-    def test_delete_by_filter_no_matches(self, document_store: InMemoryDocumentStore):
-        docs = [Document(content="Doc 1", meta={"category": "A"}), Document(content="Doc 2", meta={"category": "B"})]
-        document_store.write_documents(docs)
-
-        deleted = document_store.delete_by_filter(filters={"field": "meta.category", "operator": "==", "value": "C"})
-        assert deleted == 0
-        assert document_store.count_documents() == 2
-
-    def test_delete_by_filter_invalid_filters(self, document_store: InMemoryDocumentStore):
-        document_store.write_documents([Document(content="Doc 1")])
-        with pytest.raises(ValueError, match="Invalid filter syntax"):
-            document_store.delete_by_filter(filters={"invalid": "filter"})
-        with pytest.raises(ValueError, match="Invalid filter syntax"):
-            document_store.update_by_filter(filters={"invalid": "filter"}, meta={"key": "value"})
-
     def test_bm25_retrieval(self, document_store: InMemoryDocumentStore):
         # Tests if the bm25_retrieval method returns the correct document based on the input query.
         docs = [Document(content="Hello world"), Document(content="Haystack supports multiple languages")]