Skip to content

Commit 2d82965

Browse files
feat: add missing operations to InMemoryDocumentStore (#10805)
* adding new Mixin tests * adding more operations + using tests from Mixin * reverting changes that were wrongly introduced from another branch * reverting changes that were wrongly introduced from another branch * updating tests * adding release notes * Update haystack/document_stores/in_memory/document_store.py Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com> * adding more Mixin tests * using inheritance class * fixes/improvements from PR comments * fixing a few more docstrings --------- Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
1 parent 50dc87c commit 2d82965

3 files changed

Lines changed: 139 additions & 102 deletions

File tree

haystack/document_stores/in_memory/document_store.py

Lines changed: 114 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ def _score_bm25okapi(self, query: str, documents: list[Document]) -> list[tuple[
247247
The list of documents to score, should be produced by
248248
the filter_documents method; may be an empty list.
249249
:returns:
250-
A list of tuples, each containing a Document and its BM25L score.
250+
A list of tuples, each containing a Document and its BM25Okapi score.
251251
"""
252252
k = self.bm25_parameters.get("k1", 1.5)
253253
b = self.bm25_parameters.get("b", 0.75)
@@ -274,7 +274,7 @@ def _compute_idf(tokens: list[str]) -> dict[str, float]:
274274
return {tok: idf.get(tok, 0.0) for tok in tokens}
275275

276276
def _compute_tf(token: str, freq: dict[str, int], doc_len: int) -> float:
277-
"""Per-token BM25L computation."""
277+
"""Per-token BM25Okapi computation."""
278278
freq_term = freq.get(token, 0.0)
279279
freq_norm = freq_term + k * (1 - b + b * doc_len / self._avg_doc_len)
280280
return freq_term * (1.0 + k) / freq_norm
@@ -376,7 +376,7 @@ def from_dict(cls, data: dict[str, Any]) -> "InMemoryDocumentStore":
376376

377377
def save_to_disk(self, path: str) -> None:
378378
"""
379-
Write the database and its' data to disk as a JSON file.
379+
Write the database and its data to disk as a JSON file.
380380
381381
:param path: The path to the JSON file.
382382
"""
@@ -388,7 +388,7 @@ def save_to_disk(self, path: str) -> None:
388388
@classmethod
389389
def load_from_disk(cls, path: str) -> "InMemoryDocumentStore":
390390
"""
391-
Load the database and its' data from disk as a JSON file.
391+
Load the database and its data from disk as a JSON file.
392392
393393
:param path: The path to the JSON file.
394394
:returns: The loaded InMemoryDocumentStore.
@@ -411,18 +411,16 @@ def load_from_disk(cls, path: str) -> "InMemoryDocumentStore":
411411

412412
def count_documents(self) -> int:
413413
"""
414-
Returns the number of how many documents are present in the DocumentStore.
414+
Returns the number of documents present in the DocumentStore.
415415
"""
416416
return len(self.storage.keys())
417417

418418
def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]:
419419
"""
420420
Returns the documents that match the filters provided.
421421
422-
For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol
423-
documentation.
424-
425-
:param filters: The filters to apply to the document list.
422+
:param filters: The filters to apply. For a detailed specification of the filters, refer to the
423+
[documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
426424
:returns: A list of Documents that match the given filters.
427425
"""
428426
if filters:
@@ -485,7 +483,7 @@ def delete_documents(self, document_ids: list[str]) -> None:
485483
"""
486484
Deletes all documents with matching document_ids from the DocumentStore.
487485
488-
:param document_ids: The object_ids to delete.
486+
:param document_ids: The document_ids to delete.
489487
"""
490488
for doc_id in document_ids:
491489
if doc_id not in self.storage.keys():
@@ -551,6 +549,108 @@ def delete_by_filter(self, filters: dict[str, Any]) -> int:
551549
self.delete_documents(doc_ids)
552550
return len(doc_ids)
553551

552+
def count_documents_by_filter(self, filters: dict[str, Any]) -> int:
    """
    Counts the stored documents that satisfy the given filters.

    :param filters: The filters to apply. When empty, every stored document is counted.
        For a detailed specification of the filters, refer to the
        [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
    :returns: The number of documents that match the filters.
    """
    # Nothing to evaluate: the answer is simply the size of the store.
    if not filters:
        return len(self.storage)

    InMemoryDocumentStore._validate_filters(filters)
    matching = [
        stored for stored in self.storage.values() if document_matches_filter(filters=filters, document=stored)
    ]
    return len(matching)
565+
566+
def count_unique_metadata_by_filter(self, filters: dict[str, Any], metadata_fields: list[str]) -> dict[str, int]:
    """
    Returns the number of unique values for each specified metadata field from documents matching the filters.

    Documents where the field is missing or set to None do not contribute a value. Metadata values that
    cannot be hashed (for example lists or dicts) are compared through a hashable stand-in, so they are
    counted instead of raising a TypeError.

    :param filters: The filters to apply. When empty, all stored documents are considered.
        For a detailed specification of the filters, refer to the
        [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
    :param metadata_fields: List of field names to count unique values for.
        Field names can include or omit the "meta." prefix.
    :returns: A dictionary mapping each metadata field name (without "meta." prefix)
        to the count of its unique values among the filtered documents.
    """

    def _hashable(value: Any) -> Any:
        """Return `value` itself when it is hashable, otherwise a hashable stand-in built from its contents."""
        try:
            hash(value)
        except TypeError:
            pass
        else:
            return value
        # Tag each stand-in with the container type so that e.g. [1, 2] and (1, 2) stay distinct.
        if isinstance(value, dict):
            return ("dict", frozenset((k, _hashable(v)) for k, v in value.items()))
        if isinstance(value, (list, tuple)):
            return (type(value).__name__, tuple(_hashable(item) for item in value))
        if isinstance(value, set):
            return ("set", frozenset(value))
        # Last resort for other unhashable objects: fall back to their textual representation.
        return (type(value).__name__, repr(value))

    if filters:
        InMemoryDocumentStore._validate_filters(filters)
        docs = [doc for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc)]
    else:
        docs = list(self.storage.values())

    result: dict[str, int] = {}
    for field in metadata_fields:
        # removeprefix is a no-op when the prefix is absent, so no startswith guard is needed.
        key = field.removeprefix("meta.")
        values = {_hashable(doc.meta[key]) for doc in docs if doc.meta.get(key) is not None}
        result[key] = len(values)
    return result
590+
591+
def get_metadata_fields_info(self) -> dict[str, dict[str, str]]:
    """
    Describes the metadata fields found across the stored documents.

    The type of each field is inferred from its values: "boolean", "int", "float", or "keyword" for
    anything else. None values are ignored. When documents disagree on the type of a field, the
    document visited last determines the reported type.

    :returns: A dictionary mapping each metadata field name to a dict with a "type" key.
    """
    # Order matters: bool is a subclass of int, so it has to be checked first.
    type_labels = ((bool, "boolean"), (int, "int"), (float, "float"))

    fields_info: dict[str, dict[str, str]] = {}
    for document in self.storage.values():
        for name, raw_value in document.meta.items():
            if raw_value is None:
                continue
            label = next((text for kind, text in type_labels if isinstance(raw_value, kind)), "keyword")
            fields_info[name] = {"type": label}
    return fields_info
613+
614+
def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, Any]:
    """
    Computes the smallest and largest value of a metadata field over all stored documents.

    Only int, float and str values take part in the comparison; anything else (including None) is skipped.

    :param metadata_field: The metadata field name. Can include or omit the "meta." prefix.
    :returns: A dictionary with "min" and "max" keys. Both are None when the field is missing,
        has no comparable values, or holds values that cannot be compared with each other.
    """
    key = metadata_field.removeprefix("meta.")

    comparable = []
    for document in self.storage.values():
        candidate = document.meta.get(key)
        # A missing key yields None here, which the isinstance check rejects as well.
        if isinstance(candidate, (int, float, str)):
            comparable.append(candidate)

    if comparable:
        try:
            return {"min": min(comparable), "max": max(comparable)}
        except TypeError:
            # Mixed types (e.g. str and int) cannot be ordered.
            pass
    return {"min": None, "max": None}
634+
635+
def get_metadata_field_unique_values(
636+
self, metadata_field: str, search_term: str | None = None
637+
) -> tuple[list[str], int]:
638+
"""
639+
Returns unique values for a metadata field, optionally filtered by a search term in content.
640+
641+
:param metadata_field: The metadata field name. Can include or omit the "meta." prefix.
642+
:param search_term: If set, only documents whose content contains this term (case-insensitive)
643+
are considered.
644+
:returns: A tuple of (list of unique values, total count of unique values).
645+
"""
646+
key = metadata_field.removeprefix("meta.") if metadata_field.startswith("meta.") else metadata_field
647+
if search_term:
648+
docs = [doc for doc in self.storage.values() if doc.content and search_term.lower() in doc.content.lower()]
649+
else:
650+
docs = list(self.storage.values())
651+
values = sorted({str(doc.meta[key]) for doc in docs if key in doc.meta and doc.meta[key] is not None}, key=str)
652+
return values, len(values)
653+
554654
def bm25_retrieval(
555655
self, query: str, filters: dict[str, Any] | None = None, top_k: int = 10, scale_score: bool = False
556656
) -> list[Document]:
@@ -725,18 +825,16 @@ def _compute_query_embedding_similarity_scores(
725825

726826
async def count_documents_async(self) -> int:
727827
"""
728-
Returns the number of how many documents are present in the DocumentStore.
828+
Returns the number of documents present in the DocumentStore.
729829
"""
730830
return len(self.storage.keys())
731831

732832
async def filter_documents_async(self, filters: dict[str, Any] | None = None) -> list[Document]:
733833
"""
734834
Returns the documents that match the filters provided.
735835
736-
For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol
737-
documentation.
738-
739-
:param filters: The filters to apply to the document list.
836+
:param filters: The filters to apply. For a detailed specification of the filters, refer to the
837+
[documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
740838
:returns: A list of Documents that match the given filters.
741839
"""
742840
return await asyncio.get_running_loop().run_in_executor(
@@ -759,7 +857,7 @@ async def delete_documents_async(self, document_ids: list[str]) -> None:
759857
"""
760858
Deletes all documents with matching document_ids from the DocumentStore.
761859
762-
:param document_ids: The object_ids to delete.
860+
:param document_ids: The document_ids to delete.
763861
"""
764862
await asyncio.get_running_loop().run_in_executor(
765863
self.executor, lambda: self.delete_documents(document_ids=document_ids)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
features:
3+
- |
4+
Added new operations to the ``InMemoryDocumentStore``: ``count_documents_by_filter``, ``count_unique_metadata_by_filter``, ``get_metadata_fields_info``, ``get_metadata_field_min_max``, and ``get_metadata_field_unique_values``.

test/document_stores/test_in_memory.py

Lines changed: 21 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,27 @@
1313
from haystack import Document
1414
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
1515
from haystack.document_stores.in_memory import InMemoryDocumentStore
16-
from haystack.testing.document_store import DocumentStoreBaseTests
17-
18-
19-
class TestMemoryDocumentStore(DocumentStoreBaseTests):
16+
from haystack.testing.document_store import (
17+
CountDocumentsByFilterTest,
18+
CountUniqueMetadataByFilterTest,
19+
DocumentStoreBaseExtendedTests,
20+
DocumentStoreBaseTests,
21+
FilterableDocsFixtureMixin,
22+
GetMetadataFieldMinMaxTest,
23+
GetMetadataFieldsInfoTest,
24+
GetMetadataFieldUniqueValuesTest,
25+
)
26+
27+
28+
class TestMemoryDocumentStore(
29+
DocumentStoreBaseExtendedTests,
30+
CountDocumentsByFilterTest,
31+
CountUniqueMetadataByFilterTest,
32+
FilterableDocsFixtureMixin,
33+
GetMetadataFieldMinMaxTest,
34+
GetMetadataFieldUniqueValuesTest,
35+
GetMetadataFieldsInfoTest,
36+
):
2037
"""
2138
Test InMemoryDocumentStore's specific features
2239
"""
@@ -109,88 +126,6 @@ def test_write_documents(self, document_store):
109126
with pytest.raises(DuplicateDocumentError):
110127
document_store.write_documents(docs)
111128

112-
def test_delete_all_documents(self, document_store: InMemoryDocumentStore):
113-
docs = [Document(content="Doc 1", meta={"category": "A"}), Document(content="Doc 2", meta={"category": "B"})]
114-
document_store.write_documents(docs)
115-
assert document_store.count_documents() == 2
116-
117-
document_store.delete_all_documents()
118-
assert document_store.count_documents() == 0
119-
assert document_store.filter_documents() == []
120-
121-
# Store remains functional after delete_all
122-
document_store.write_documents([Document(content="New doc")])
123-
assert document_store.count_documents() == 1
124-
125-
def test_delete_all_documents_empty_store(self, document_store: InMemoryDocumentStore):
126-
document_store.delete_all_documents()
127-
assert document_store.count_documents() == 0
128-
129-
def test_update_by_filter(self, document_store: InMemoryDocumentStore):
130-
docs = [
131-
Document(content="Doc 1", meta={"category": "A", "year": 2023}),
132-
Document(content="Doc 2", meta={"category": "B", "year": 2023}),
133-
Document(content="Doc 3", meta={"category": "A", "year": 2024}),
134-
]
135-
document_store.write_documents(docs)
136-
137-
updated = document_store.update_by_filter(
138-
filters={"field": "meta.category", "operator": "==", "value": "A"}, meta={"updated": True, "tag": "foo"}
139-
)
140-
assert updated == 2
141-
142-
all_docs = document_store.filter_documents()
143-
category_a = [d for d in all_docs if d.meta.get("category") == "A"]
144-
category_b = [d for d in all_docs if d.meta.get("category") == "B"]
145-
assert len(category_a) == 2
146-
assert all(d.meta.get("updated") is True and d.meta.get("tag") == "foo" for d in category_a)
147-
assert len(category_b) == 1
148-
assert "updated" not in category_b[0].meta and "tag" not in category_b[0].meta
149-
150-
def test_update_by_filter_no_matches(self, document_store: InMemoryDocumentStore):
151-
docs = [Document(content="Doc 1", meta={"category": "A"}), Document(content="Doc 2", meta={"category": "B"})]
152-
document_store.write_documents(docs)
153-
154-
updated = document_store.update_by_filter(
155-
filters={"field": "meta.category", "operator": "==", "value": "C"}, meta={"updated": True}
156-
)
157-
assert updated == 0
158-
assert document_store.count_documents() == 2
159-
160-
def test_delete_by_filter(self, document_store: InMemoryDocumentStore):
161-
docs = [
162-
Document(content="Doc 1", meta={"category": "A", "year": 2023}),
163-
Document(content="Doc 2", meta={"category": "B", "year": 2023}),
164-
Document(content="Doc 3", meta={"category": "A", "year": 2024}),
165-
]
166-
document_store.write_documents(docs)
167-
assert document_store.count_documents() == 3
168-
169-
deleted = document_store.delete_by_filter(filters={"field": "meta.category", "operator": "==", "value": "A"})
170-
assert deleted == 2
171-
assert document_store.count_documents() == 1
172-
remaining = document_store.filter_documents()
173-
assert remaining[0].meta["category"] == "B"
174-
175-
deleted = document_store.delete_by_filter(filters={"field": "meta.year", "operator": "==", "value": 2023})
176-
assert deleted == 1
177-
assert document_store.count_documents() == 0
178-
179-
def test_delete_by_filter_no_matches(self, document_store: InMemoryDocumentStore):
180-
docs = [Document(content="Doc 1", meta={"category": "A"}), Document(content="Doc 2", meta={"category": "B"})]
181-
document_store.write_documents(docs)
182-
183-
deleted = document_store.delete_by_filter(filters={"field": "meta.category", "operator": "==", "value": "C"})
184-
assert deleted == 0
185-
assert document_store.count_documents() == 2
186-
187-
def test_delete_by_filter_invalid_filters(self, document_store: InMemoryDocumentStore):
188-
document_store.write_documents([Document(content="Doc 1")])
189-
with pytest.raises(ValueError, match="Invalid filter syntax"):
190-
document_store.delete_by_filter(filters={"invalid": "filter"})
191-
with pytest.raises(ValueError, match="Invalid filter syntax"):
192-
document_store.update_by_filter(filters={"invalid": "filter"}, meta={"key": "value"})
193-
194129
def test_bm25_retrieval(self, document_store: InMemoryDocumentStore):
195130
# Tests if the bm25_retrieval method returns the correct document based on the input query.
196131
docs = [Document(content="Hello world"), Document(content="Haystack supports multiple languages")]

0 commit comments

Comments
 (0)