Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
ceea7b0
adding new Mixin tests
davidsbatista Mar 12, 2026
9b57bf0
adding more operations + using tests from Mixin
davidsbatista Mar 12, 2026
f2dcce7
reverting changes that were wrongly introduced from another branch
davidsbatista Mar 12, 2026
806d252
reverting changes that were wrongly introduced from another branch
davidsbatista Mar 12, 2026
7d556bd
updating tests
davidsbatista Mar 12, 2026
8ea51cd
adding release notes
davidsbatista Mar 12, 2026
f8ad9d0
Merge branch 'main' into feat/add-more-operations-to-in-memory-doc-store
davidsbatista Mar 16, 2026
03d0a79
Merge branch 'main' into feat/add-more-operations-to-in-memory-doc-store
davidsbatista Mar 17, 2026
8871b94
Merge branch 'main' into feat/add-more-operations-to-in-memory-doc-store
davidsbatista Mar 18, 2026
d80e605
Update haystack/document_stores/in_memory/document_store.py
davidsbatista Mar 18, 2026
9e0499f
Merge branch 'main' into feat/add-more-operations-to-in-memory-doc-store
davidsbatista Mar 18, 2026
dd9ca4f
Merge branch 'main' into feat/add-more-operations-to-in-memory-doc-store
davidsbatista Mar 18, 2026
1e1772c
adding more Mixin tests
davidsbatista Mar 18, 2026
4046e7c
Merge branch 'main' into feat/add-more-operations-to-in-memory-doc-store
davidsbatista Mar 18, 2026
d6d81a6
using inheritance class
davidsbatista Mar 18, 2026
5da06f8
fixes/improvements from PR comments
davidsbatista Mar 19, 2026
f1d962d
Merge branch 'main' into feat/add-more-operations-to-in-memory-doc-store
davidsbatista Mar 19, 2026
093ee17
fixing a few more docstrings
davidsbatista Mar 19, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 114 additions & 16 deletions haystack/document_stores/in_memory/document_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ def _score_bm25okapi(self, query: str, documents: list[Document]) -> list[tuple[
The list of documents to score, should be produced by
the filter_documents method; may be an empty list.
:returns:
A list of tuples, each containing a Document and its BM25L score.
A list of tuples, each containing a Document and its BM25Okapi score.
"""
k = self.bm25_parameters.get("k1", 1.5)
b = self.bm25_parameters.get("b", 0.75)
Expand All @@ -274,7 +274,7 @@ def _compute_idf(tokens: list[str]) -> dict[str, float]:
return {tok: idf.get(tok, 0.0) for tok in tokens}

def _compute_tf(token: str, freq: dict[str, int], doc_len: int) -> float:
"""Per-token BM25L computation."""
"""Per-token BM25Okapi computation."""
freq_term = freq.get(token, 0.0)
freq_norm = freq_term + k * (1 - b + b * doc_len / self._avg_doc_len)
return freq_term * (1.0 + k) / freq_norm
Expand Down Expand Up @@ -376,7 +376,7 @@ def from_dict(cls, data: dict[str, Any]) -> "InMemoryDocumentStore":

def save_to_disk(self, path: str) -> None:
"""
Write the database and its' data to disk as a JSON file.
Write the database and its data to disk as a JSON file.

:param path: The path to the JSON file.
"""
Expand All @@ -388,7 +388,7 @@ def save_to_disk(self, path: str) -> None:
@classmethod
def load_from_disk(cls, path: str) -> "InMemoryDocumentStore":
"""
Load the database and its' data from disk as a JSON file.
Load the database and its data from disk as a JSON file.

:param path: The path to the JSON file.
:returns: The loaded InMemoryDocumentStore.
Expand All @@ -411,18 +411,16 @@ def load_from_disk(cls, path: str) -> "InMemoryDocumentStore":

def count_documents(self) -> int:
"""
Returns the number of how many documents are present in the DocumentStore.
Returns the number of documents present in the DocumentStore.
"""
return len(self.storage.keys())

def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Document]:
"""
Returns the documents that match the filters provided.

For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol
documentation.

:param filters: The filters to apply to the document list.
:param filters: The filters to apply. For a detailed specification of the filters, refer to the
[documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
:returns: A list of Documents that match the given filters.
"""
if filters:
Expand Down Expand Up @@ -485,7 +483,7 @@ def delete_documents(self, document_ids: list[str]) -> None:
"""
Deletes all documents with matching document_ids from the DocumentStore.

:param document_ids: The object_ids to delete.
:param document_ids: The document_ids to delete.
"""
for doc_id in document_ids:
if doc_id not in self.storage.keys():
Expand Down Expand Up @@ -551,6 +549,108 @@ def delete_by_filter(self, filters: dict[str, Any]) -> int:
self.delete_documents(doc_ids)
return len(doc_ids)

def count_documents_by_filter(self, filters: dict[str, Any]) -> int:
    """
    Returns the number of documents that match the provided filters.

    :param filters: The filters to apply.
        For a detailed specification of the filters, refer to the
        [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
    :returns: The number of documents that match the filters.
    """
    if not filters:
        # An empty/None filter matches every stored document.
        return len(self.storage)
    InMemoryDocumentStore._validate_filters(filters)
    match_flags = (document_matches_filter(filters=filters, document=doc) for doc in self.storage.values())
    return sum(1 for matched in match_flags if matched)

def count_unique_metadata_by_filter(self, filters: dict[str, Any], metadata_fields: list[str]) -> dict[str, int]:
    """
    Returns the number of unique values for each specified metadata field from documents matching the filters.

    :param filters: The filters to apply.
        For a detailed specification of the filters, refer to the
        [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
    :param metadata_fields: List of field names to count unique values for.
        Field names can include or omit the "meta." prefix.
    :returns: A dictionary mapping each metadata field name (without "meta." prefix)
        to the count of its unique values among the filtered documents.
    """
    if filters:
        InMemoryDocumentStore._validate_filters(filters)
        docs = [doc for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc)]
    else:
        docs = list(self.storage.values())

    result: dict[str, int] = {}
    for field in metadata_fields:
        # str.removeprefix is a no-op when the prefix is absent, so no startswith guard is needed.
        key = field.removeprefix("meta.")
        # meta.get(key) is None covers both "key missing" and "value is None".
        unique_values = {doc.meta[key] for doc in docs if doc.meta.get(key) is not None}
        result[key] = len(unique_values)
    return result

def get_metadata_fields_info(self) -> dict[str, dict[str, str]]:
    """
    Returns information about the metadata fields present in the stored documents.

    Types are inferred from the stored values (keyword, int, float, boolean).

    :returns: A dictionary mapping each metadata field name to a dict with a "type" key.
    """
    # Probe order matters: bool is a subclass of int, so it must be checked first.
    probes = ((bool, "boolean"), (int, "int"), (float, "float"))
    inferred: dict[str, str] = {}
    for document in self.storage.values():
        for field_name, field_value in document.meta.items():
            if field_value is None:
                # None carries no type information; skip it.
                continue
            label = next((name for cls, name in probes if isinstance(field_value, cls)), "keyword")
            inferred[field_name] = label
    return {field_name: {"type": label} for field_name, label in inferred.items()}

def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, Any]:
    """
    Returns the minimum and maximum values for the given metadata field across all documents.

    :param metadata_field: The metadata field name. Can include or omit the "meta." prefix.
    :returns: A dictionary with "min" and "max" keys. Returns `{"min": None, "max": None}`
        if the field is missing, has no comparable values, or mixes incomparable types.
    """
    # str.removeprefix is a no-op when the prefix is absent, so no startswith guard is needed.
    key = metadata_field.removeprefix("meta.")
    # isinstance(None, ...) is False, so meta.get(key) covers "missing" and "None" in one check.
    values = [
        doc.meta[key] for doc in self.storage.values() if isinstance(doc.meta.get(key), (int, float, str))
    ]
    if not values:
        return {"min": None, "max": None}
    try:
        return {"min": min(values), "max": max(values)}
    except TypeError:
        # Mixed incomparable types (e.g. str together with int) cannot be ordered.
        return {"min": None, "max": None}

def get_metadata_field_unique_values(
self, metadata_field: str, search_term: str | None = None
) -> tuple[list[str], int]:
"""
Returns unique values for a metadata field, optionally filtered by a search term in content.

:param metadata_field: The metadata field name. Can include or omit the "meta." prefix.
:param search_term: If set, only documents whose content contains this term (case-insensitive)
are considered.
:returns: A tuple of (list of unique values, total count of unique values).
"""
key = metadata_field.removeprefix("meta.") if metadata_field.startswith("meta.") else metadata_field
if search_term:
docs = [doc for doc in self.storage.values() if doc.content and search_term.lower() in doc.content.lower()]
else:
docs = list(self.storage.values())
values = sorted({str(doc.meta[key]) for doc in docs if key in doc.meta and doc.meta[key] is not None}, key=str)
return values, len(values)

def bm25_retrieval(
self, query: str, filters: dict[str, Any] | None = None, top_k: int = 10, scale_score: bool = False
) -> list[Document]:
Expand Down Expand Up @@ -725,18 +825,16 @@ def _compute_query_embedding_similarity_scores(

async def count_documents_async(self) -> int:
"""
Returns the number of how many documents are present in the DocumentStore.
Returns the number of documents present in the DocumentStore.
"""
return len(self.storage.keys())

async def filter_documents_async(self, filters: dict[str, Any] | None = None) -> list[Document]:
"""
Returns the documents that match the filters provided.

For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol
documentation.

:param filters: The filters to apply to the document list.
:param filters: The filters to apply. For a detailed specification of the filters, refer to the
[documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
:returns: A list of Documents that match the given filters.
"""
return await asyncio.get_running_loop().run_in_executor(
Expand All @@ -759,7 +857,7 @@ async def delete_documents_async(self, document_ids: list[str]) -> None:
"""
Deletes all documents with matching document_ids from the DocumentStore.

:param document_ids: The object_ids to delete.
:param document_ids: The document_ids to delete.
"""
await asyncio.get_running_loop().run_in_executor(
self.executor, lambda: self.delete_documents(document_ids=document_ids)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
features:
- |
Added new operations to the ``InMemoryDocumentStore``: count_documents_by_filter, count_unique_metadata_by_filter, get_metadata_fields_info, get_metadata_field_min_max, get_metadata_field_unique_values
107 changes: 21 additions & 86 deletions test/document_stores/test_in_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,27 @@
from haystack import Document
from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.testing.document_store import DocumentStoreBaseTests


class TestMemoryDocumentStore(DocumentStoreBaseTests):
from haystack.testing.document_store import (
CountDocumentsByFilterTest,
CountUniqueMetadataByFilterTest,
DocumentStoreBaseExtendedTests,
DocumentStoreBaseTests,
FilterableDocsFixtureMixin,
GetMetadataFieldMinMaxTest,
GetMetadataFieldsInfoTest,
GetMetadataFieldUniqueValuesTest,
)


class TestMemoryDocumentStore(
DocumentStoreBaseExtendedTests,
CountDocumentsByFilterTest,
CountUniqueMetadataByFilterTest,
FilterableDocsFixtureMixin,
GetMetadataFieldMinMaxTest,
GetMetadataFieldUniqueValuesTest,
GetMetadataFieldsInfoTest,
):
"""
Test InMemoryDocumentStore's specific features
"""
Expand Down Expand Up @@ -109,88 +126,6 @@ def test_write_documents(self, document_store):
with pytest.raises(DuplicateDocumentError):
document_store.write_documents(docs)

def test_delete_all_documents(self, document_store: InMemoryDocumentStore):
docs = [Document(content="Doc 1", meta={"category": "A"}), Document(content="Doc 2", meta={"category": "B"})]
document_store.write_documents(docs)
assert document_store.count_documents() == 2

document_store.delete_all_documents()
assert document_store.count_documents() == 0
assert document_store.filter_documents() == []

# Store remains functional after delete_all
document_store.write_documents([Document(content="New doc")])
assert document_store.count_documents() == 1

def test_delete_all_documents_empty_store(self, document_store: InMemoryDocumentStore):
document_store.delete_all_documents()
assert document_store.count_documents() == 0

def test_update_by_filter(self, document_store: InMemoryDocumentStore):
docs = [
Document(content="Doc 1", meta={"category": "A", "year": 2023}),
Document(content="Doc 2", meta={"category": "B", "year": 2023}),
Document(content="Doc 3", meta={"category": "A", "year": 2024}),
]
document_store.write_documents(docs)

updated = document_store.update_by_filter(
filters={"field": "meta.category", "operator": "==", "value": "A"}, meta={"updated": True, "tag": "foo"}
)
assert updated == 2

all_docs = document_store.filter_documents()
category_a = [d for d in all_docs if d.meta.get("category") == "A"]
category_b = [d for d in all_docs if d.meta.get("category") == "B"]
assert len(category_a) == 2
assert all(d.meta.get("updated") is True and d.meta.get("tag") == "foo" for d in category_a)
assert len(category_b) == 1
assert "updated" not in category_b[0].meta and "tag" not in category_b[0].meta

def test_update_by_filter_no_matches(self, document_store: InMemoryDocumentStore):
docs = [Document(content="Doc 1", meta={"category": "A"}), Document(content="Doc 2", meta={"category": "B"})]
document_store.write_documents(docs)

updated = document_store.update_by_filter(
filters={"field": "meta.category", "operator": "==", "value": "C"}, meta={"updated": True}
)
assert updated == 0
assert document_store.count_documents() == 2

def test_delete_by_filter(self, document_store: InMemoryDocumentStore):
docs = [
Document(content="Doc 1", meta={"category": "A", "year": 2023}),
Document(content="Doc 2", meta={"category": "B", "year": 2023}),
Document(content="Doc 3", meta={"category": "A", "year": 2024}),
]
document_store.write_documents(docs)
assert document_store.count_documents() == 3

deleted = document_store.delete_by_filter(filters={"field": "meta.category", "operator": "==", "value": "A"})
assert deleted == 2
assert document_store.count_documents() == 1
remaining = document_store.filter_documents()
assert remaining[0].meta["category"] == "B"

deleted = document_store.delete_by_filter(filters={"field": "meta.year", "operator": "==", "value": 2023})
assert deleted == 1
assert document_store.count_documents() == 0

def test_delete_by_filter_no_matches(self, document_store: InMemoryDocumentStore):
docs = [Document(content="Doc 1", meta={"category": "A"}), Document(content="Doc 2", meta={"category": "B"})]
document_store.write_documents(docs)

deleted = document_store.delete_by_filter(filters={"field": "meta.category", "operator": "==", "value": "C"})
assert deleted == 0
assert document_store.count_documents() == 2

def test_delete_by_filter_invalid_filters(self, document_store: InMemoryDocumentStore):
document_store.write_documents([Document(content="Doc 1")])
with pytest.raises(ValueError, match="Invalid filter syntax"):
document_store.delete_by_filter(filters={"invalid": "filter"})
with pytest.raises(ValueError, match="Invalid filter syntax"):
document_store.update_by_filter(filters={"invalid": "filter"}, meta={"key": "value"})

def test_bm25_retrieval(self, document_store: InMemoryDocumentStore):
# Tests if the bm25_retrieval method returns the correct document based on the input query.
docs = [Document(content="Hello world"), Document(content="Haystack supports multiple languages")]
Expand Down
Loading