@@ -247,7 +247,7 @@ def _score_bm25okapi(self, query: str, documents: list[Document]) -> list[tuple[
247247 The list of documents to score, should be produced by
248248 the filter_documents method; may be an empty list.
249249 :returns:
250- A list of tuples, each containing a Document and its BM25L score.
250+ A list of tuples, each containing a Document and its BM25Okapi score.
251251 """
252252 k = self .bm25_parameters .get ("k1" , 1.5 )
253253 b = self .bm25_parameters .get ("b" , 0.75 )
@@ -274,7 +274,7 @@ def _compute_idf(tokens: list[str]) -> dict[str, float]:
274274 return {tok : idf .get (tok , 0.0 ) for tok in tokens }
275275
def _compute_tf(token: str, freq: dict[str, int], doc_len: int) -> float:
    """Per-token BM25Okapi computation.

    Relies on the enclosing scope for the BM25 parameters ``k`` and ``b``
    and the corpus statistic ``self._avg_doc_len``.
    """
    raw_count = freq.get(token, 0.0)
    # Length normalization of the denominator: 1 - b + b * (|d| / avgdl).
    length_norm = 1 - b + b * doc_len / self._avg_doc_len
    return raw_count * (1.0 + k) / (raw_count + k * length_norm)
@@ -376,7 +376,7 @@ def from_dict(cls, data: dict[str, Any]) -> "InMemoryDocumentStore":
376376
377377 def save_to_disk (self , path : str ) -> None :
378378 """
379- Write the database and its' data to disk as a JSON file.
379+ Write the database and its data to disk as a JSON file.
380380
381381 :param path: The path to the JSON file.
382382 """
@@ -388,7 +388,7 @@ def save_to_disk(self, path: str) -> None:
388388 @classmethod
389389 def load_from_disk (cls , path : str ) -> "InMemoryDocumentStore" :
390390 """
391- Load the database and its' data from disk as a JSON file.
391+ Load the database and its data from disk as a JSON file.
392392
393393 :param path: The path to the JSON file.
394394 :returns: The loaded InMemoryDocumentStore.
@@ -411,18 +411,16 @@ def load_from_disk(cls, path: str) -> "InMemoryDocumentStore":
411411
def count_documents(self) -> int:
    """
    Returns the number of documents present in the DocumentStore.
    """
    # len() on the dict itself; materializing the .keys() view is redundant.
    return len(self.storage)
417417
418418 def filter_documents (self , filters : dict [str , Any ] | None = None ) -> list [Document ]:
419419 """
420420 Returns the documents that match the filters provided.
421421
422- For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol
423- documentation.
424-
425- :param filters: The filters to apply to the document list.
422+ :param filters: The filters to apply. For a detailed specification of the filters, refer to the
423+ [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
426424 :returns: A list of Documents that match the given filters.
427425 """
428426 if filters :
@@ -485,7 +483,7 @@ def delete_documents(self, document_ids: list[str]) -> None:
485483 """
486484 Deletes all documents with matching document_ids from the DocumentStore.
487485
488- :param document_ids: The object_ids to delete.
486+ :param document_ids: The document_ids to delete.
489487 """
490488 for doc_id in document_ids :
491489 if doc_id not in self .storage .keys ():
@@ -551,6 +549,108 @@ def delete_by_filter(self, filters: dict[str, Any]) -> int:
551549 self .delete_documents (doc_ids )
552550 return len (doc_ids )
553551
def count_documents_by_filter(self, filters: dict[str, Any]) -> int:
    """
    Returns the number of documents that match the provided filters.

    :param filters: The filters to apply.
        For a detailed specification of the filters, refer to the
        [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
    :returns: The number of documents that match the filters.
    """
    if not filters:
        # No filters provided: every stored document counts.
        return len(self.storage)
    InMemoryDocumentStore._validate_filters(filters)
    return sum(1 for document in self.storage.values() if document_matches_filter(filters=filters, document=document))
565+
def count_unique_metadata_by_filter(self, filters: dict[str, Any], metadata_fields: list[str]) -> dict[str, int]:
    """
    Returns the number of unique values for each specified metadata field from documents matching the filters.

    :param filters: The filters to apply.
        For a detailed specification of the filters, refer to the
        [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
    :param metadata_fields: List of field names to count unique values for.
        Field names can include or omit the "meta." prefix.
    :returns: A dictionary mapping each metadata field name (without "meta." prefix)
        to the count of its unique values among the filtered documents.
        `None` values are not counted.
    """
    if filters:
        InMemoryDocumentStore._validate_filters(filters)
        docs = [doc for doc in self.storage.values() if document_matches_filter(filters=filters, document=doc)]
    else:
        docs = list(self.storage.values())

    result: dict[str, int] = {}
    for field in metadata_fields:
        # str.removeprefix is a no-op when the prefix is absent, so no startswith guard is needed.
        key = field.removeprefix("meta.")
        # doc.meta.get(key) is not None covers both "key missing" and "value is None".
        values = {doc.meta[key] for doc in docs if doc.meta.get(key) is not None}
        result[key] = len(values)
    return result
590+
def get_metadata_fields_info(self) -> dict[str, dict[str, str]]:
    """
    Returns information about the metadata fields present in the stored documents.

    Types are inferred from the stored values (keyword, int, float, boolean).
    `None` values are ignored; when a field holds values of different types
    across documents, the last document seen wins.

    :returns: A dictionary mapping each metadata field name to a dict with a "type" key.
    """

    def _infer_type(value: Any) -> str:
        # bool must be tested before int because bool is a subclass of int.
        if isinstance(value, bool):
            return "boolean"
        if isinstance(value, int):
            return "int"
        if isinstance(value, float):
            return "float"
        return "keyword"

    inferred: dict[str, str] = {}
    for document in self.storage.values():
        for name, value in document.meta.items():
            if value is not None:
                inferred[name] = _infer_type(value)
    return {name: {"type": kind} for name, kind in inferred.items()}
613+
def get_metadata_field_min_max(self, metadata_field: str) -> dict[str, Any]:
    """
    Returns the minimum and maximum values for the given metadata field across all documents.

    Only int, float, and str values are considered; `None` values are skipped.

    :param metadata_field: The metadata field name. Can include or omit the "meta." prefix.
    :returns: A dictionary with "min" and "max" keys. Returns `{"min": None, "max": None}`
        if the field is missing, has no values, or holds mutually incomparable types.
    """
    # str.removeprefix is a no-op when the prefix is absent, so no startswith guard is needed.
    key = metadata_field.removeprefix("meta.")
    # The walrus binding avoids looking the key up three times per document.
    values = [
        value
        for doc in self.storage.values()
        if (value := doc.meta.get(key)) is not None and isinstance(value, (int, float, str))
    ]
    if not values:
        return {"min": None, "max": None}
    try:
        return {"min": min(values), "max": max(values)}
    except TypeError:
        # Mixed incomparable types (e.g. str vs int): report no range rather than raising.
        return {"min": None, "max": None}
634+
635+ def get_metadata_field_unique_values (
636+ self , metadata_field : str , search_term : str | None = None
637+ ) -> tuple [list [str ], int ]:
638+ """
639+ Returns unique values for a metadata field, optionally filtered by a search term in content.
640+
641+ :param metadata_field: The metadata field name. Can include or omit the "meta." prefix.
642+ :param search_term: If set, only documents whose content contains this term (case-insensitive)
643+ are considered.
644+ :returns: A tuple of (list of unique values, total count of unique values).
645+ """
646+ key = metadata_field .removeprefix ("meta." ) if metadata_field .startswith ("meta." ) else metadata_field
647+ if search_term :
648+ docs = [doc for doc in self .storage .values () if doc .content and search_term .lower () in doc .content .lower ()]
649+ else :
650+ docs = list (self .storage .values ())
651+ values = sorted ({str (doc .meta [key ]) for doc in docs if key in doc .meta and doc .meta [key ] is not None }, key = str )
652+ return values , len (values )
653+
554654 def bm25_retrieval (
555655 self , query : str , filters : dict [str , Any ] | None = None , top_k : int = 10 , scale_score : bool = False
556656 ) -> list [Document ]:
@@ -725,18 +825,16 @@ def _compute_query_embedding_similarity_scores(
725825
async def count_documents_async(self) -> int:
    """
    Returns the number of documents present in the DocumentStore.
    """
    # len() on the dict itself; materializing the .keys() view is redundant.
    return len(self.storage)
731831
732832 async def filter_documents_async (self , filters : dict [str , Any ] | None = None ) -> list [Document ]:
733833 """
734834 Returns the documents that match the filters provided.
735835
736- For a detailed specification of the filters, refer to the DocumentStore.filter_documents() protocol
737- documentation.
738-
739- :param filters: The filters to apply to the document list.
836+ :param filters: The filters to apply. For a detailed specification of the filters, refer to the
837+ [documentation](https://docs.haystack.deepset.ai/docs/metadata-filtering).
740838 :returns: A list of Documents that match the given filters.
741839 """
742840 return await asyncio .get_running_loop ().run_in_executor (
@@ -759,7 +857,7 @@ async def delete_documents_async(self, document_ids: list[str]) -> None:
759857 """
760858 Deletes all documents with matching document_ids from the DocumentStore.
761859
762- :param document_ids: The object_ids to delete.
860+ :param document_ids: The document_ids to delete.
763861 """
764862 await asyncio .get_running_loop ().run_in_executor (
765863 self .executor , lambda : self .delete_documents (document_ids = document_ids )
0 commit comments