1+ # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
2+ #
3+ # SPDX-License-Identifier: Apache-2.0
4+
15import json
26import logging
37from collections import defaultdict
@@ -60,9 +64,8 @@ def _create_new_index(self):
6064 base_index = faiss .index_factory (self .embedding_dim , self .index_string )
6165 self .index = faiss .IndexIDMap (base_index )
6266 except RuntimeError as e :
63- raise DocumentStoreError (
64- f"Could not create FAISS index with factory string '{ self .index_string } ': { e } "
65- ) from e
67+ error_msg = f"Could not create FAISS index with factory string '{ self .index_string } ': { e } "
68+ raise DocumentStoreError (error_msg ) from e
6669
6770 def count_documents (self ) -> int :
6871 """
@@ -89,8 +92,10 @@ def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Docume
8992 def _matches_filters (self , doc : Document , filters : dict [str , Any ]) -> bool :
9093 """
9194 Checks if a document matches the given filters.
92- Currently supports simple equality check for 'field' == 'value', and logical operators AND/OR/NOT are NOT fully implemented in this MVP helper.
93- Wait, Haystack 2.x filters are complex. We should use a proper filter parser or a simple recusive check if we want to support full syntax.
95+ Currently supports simple equality check for 'field' == 'value'.
96+ Logical operators AND/OR/NOT are NOT fully implemented in this MVP helper.
97+ Wait, Haystack 2.x filters are complex. We should use a proper filter parser
98+ or a simple recusive check if we want to support full syntax.
9499 For MVP, let's implement basic filtering logic.
95100 """
96101 if "operator" not in filters :
@@ -130,7 +135,8 @@ def write_documents(self, documents: list[Document], policy: DuplicatePolicy = D
130135 if policy == DuplicatePolicy .FAIL :
131136 for doc in documents :
132137 if doc .id in self .documents :
133- raise DuplicateDocumentError (f"Document with id '{ doc .id } ' already exists." )
138+ msg = f"Document with id '{ doc .id } ' already exists."
139+ raise DuplicateDocumentError (msg )
134140
135141 # Process documents
136142 ids_to_add_to_index = []
@@ -223,8 +229,8 @@ def search(
223229
224230 # Search in FAISS
225231 # Valid strategy for pre-filtering vs post-filtering:
226- # Since FAISS `IndexIDMap` doesn't support pre-filtering natively comfortably without `RangeSearch` or specialized impls,
227- # we usually fetch more (k * scale_factor) and filter post-retrieval.
232+ # Since FAISS `IndexIDMap` doesn't support pre-filtering natively comfortably
233+ # without `RangeSearch` or specialized impls, we fetch more and filter post-retrieval.
228234
229235 fetch_k = top_k
230236 if filters :
@@ -233,7 +239,7 @@ def search(
233239 distances , indices = self .index .search (query_vec , fetch_k )
234240
235241 results = []
236- for dist , int_id in zip (distances [0 ], indices [0 ]):
242+ for dist , int_id in zip (distances [0 ], indices [0 ], strict = False ):
237243 if int_id == - 1 :
238244 continue
239245
@@ -262,40 +268,42 @@ def search(
262268
263269 return results
264270
265- def _get_result_to_documents (self , result ) -> list [Document ]:
266- # Compatibility/Helper if matching Chroma approach
267- return []
268-
269271 def _check_condition (self , doc : Document , condition : dict [str , Any ]) -> bool :
270272 if "operator" not in condition and "conditions" not in condition :
271273 # This might be a legacy or malformed filter from tests like test_missing_top_level_operator_key
272274 # The standard Haystack filter structure enforces keys.
273275 # On failure to parse standard structure, we should raise FilterError as per tests?
274276 # Actually, looking at the tests (e.g. TestFAISSDocumentStore.test_missing_top_level_operator_key),
275277 # they expect FilterError if "operator" is missing from a condition block.
276- raise FilterError ("Filter condition missing 'operator'" )
278+ msg = "Filter condition missing 'operator'"
279+ raise FilterError (msg )
277280
278281 operator = condition .get ("operator" , "==" )
279282
280283 if operator == "AND" :
281284 if "conditions" not in condition :
282- raise FilterError ("Missing 'conditions' for AND operator" )
285+ msg = "Missing 'conditions' for AND operator"
286+ raise FilterError (msg )
283287 return all (self ._check_condition (doc , cond ) for cond in condition .get ("conditions" , []))
284288 elif operator == "OR" :
285289 if "conditions" not in condition :
286- raise FilterError ("Missing 'conditions' for OR operator" )
290+ msg = "Missing 'conditions' for OR operator"
291+ raise FilterError (msg )
287292 return any (self ._check_condition (doc , cond ) for cond in condition .get ("conditions" , []))
288293 elif operator == "NOT" :
289294 if "conditions" not in condition :
290- raise FilterError ("Missing 'conditions' for NOT operator" )
295+ msg = "Missing 'conditions' for NOT operator"
296+ raise FilterError (msg )
291297 return not self ._check_condition (doc , condition .get ("conditions" , [])[0 ])
292298
293299 # Leaf condition
294300 if "field" not in condition :
295- raise FilterError ("Missing 'field' in filter condition" )
301+ msg = "Missing 'field' in filter condition"
302+ raise FilterError (msg )
296303 field = condition .get ("field" )
297304 if "value" not in condition :
298- raise FilterError ("Missing 'value' in filter condition" )
305+ msg = "Missing 'value' in filter condition"
306+ raise FilterError (msg )
299307 value = condition .get ("value" )
300308
301309 doc_val = self ._get_doc_value (doc , field )
@@ -307,7 +315,8 @@ def _check_condition(self, doc: Document, condition: dict[str, Any]) -> bool:
307315 return False
308316
309317 if value is None :
310- # Comparing anything with None using inequalities is invalid, but tests expect efficient handling (no match)
318+ # Comparing anything with None using inequalities is invalid,
319+ # but tests expect efficient handling (no match)
311320 return False
312321
313322 # Check for compatibility
@@ -318,9 +327,10 @@ def _check_condition(self, doc: Document, condition: dict[str, Any]) -> bool:
318327 if is_number_doc and is_number_val :
319328 # Compatible
320329 pass
321- elif type (doc_val ) != type (value ):
330+ elif type (doc_val ) is not type (value ):
322331 # Incompatible types for inequality implementation (like str vs int, or list vs int)
323- raise FilterError (f"Type mismatch: cannot compare { type (doc_val )} with { type (value )} " )
332+ msg = f"Type mismatch: cannot compare { type (doc_val )} with { type (value )} "
333+ raise FilterError (msg )
324334
325335 try :
326336 if operator == ">" :
@@ -332,19 +342,22 @@ def _check_condition(self, doc: Document, condition: dict[str, Any]) -> bool:
332342 if operator == "<=" :
333343 return doc_val <= value
334344 except TypeError as e :
335- raise FilterError (f"Type mismatch in filter: { e } " ) from e
345+ msg = f"Type mismatch in filter: { e } "
346+ raise FilterError (msg ) from e
336347
337348 if operator == "==" :
338349 return doc_val == value
339350 elif operator == "!=" :
340351 return doc_val != value
341352 elif operator == "in" :
342353 if not isinstance (value , list ):
343- raise FilterError ("Value for 'in' must be a list" )
354+ msg = "Value for 'in' must be a list"
355+ raise FilterError (msg )
344356 return doc_val in value
345357 elif operator == "not in" :
346358 if not isinstance (value , list ):
347- raise FilterError ("Value for 'not in' must be a list" )
359+ msg = "Value for 'not in' must be a list"
360+ raise FilterError (msg )
348361 return doc_val not in value
349362
350363 return False
@@ -361,6 +374,16 @@ def count_documents_by_filter(self, filters: dict[str, Any]) -> int:
361374 return len (self .filter_documents (filters ))
362375
363376 def update_by_filter (self , filters : dict [str , Any ], meta : dict [str , Any ]) -> int :
377+ """
378+ Updates documents that match the provided filters with the new metadata.
379+
380+ Note: Updates are performed in-memory only. To persist these changes,
381+ you must explicitly call `save()` after updating.
382+
383+ :param filters: A dictionary of filters to apply to find documents to update.
384+ :param meta: A dictionary of metadata key-value pairs to update in the matching documents.
385+ :returns: The number of documents updated.
386+ """
364387 docs_to_update = self .filter_documents (filters )
365388 for doc in docs_to_update :
366389 doc .meta .update (meta )
@@ -384,6 +407,12 @@ def get_metadata_fields_info(self) -> dict[str, dict[str, str]]:
384407 return fields_idx
385408
386409 def get_metadata_field_min_max (self , field_name : str ) -> dict [str , Any ]:
410+ """
411+ Returns the minimum and maximum values for a specific metadata field.
412+
413+ :param field_name: The name of the metadata field.
414+ :returns: A dictionary with keys "min" and "max" containing the respective min and max values.
415+ """
387416 values = []
388417 for doc in self .documents .values ():
389418 val = (
@@ -400,6 +429,12 @@ def get_metadata_field_min_max(self, field_name: str) -> dict[str, Any]:
400429 return {"min" : min (values ), "max" : max (values )}
401430
402431 def get_metadata_field_unique_values (self , field_name : str ) -> list [Any ]:
432+ """
433+ Returns all unique values for a specific metadata field.
434+
435+ :param field_name: The name of the metadata field.
436+ :returns: A list of unique values for the specified field.
437+ """
403438 values = set ()
404439 for doc in self .documents .values ():
405440 val = (
@@ -412,6 +447,13 @@ def get_metadata_field_unique_values(self, field_name: str) -> list[Any]:
412447 return list (values )
413448
414449 def count_unique_metadata_by_filter (self , filters : dict [str , Any ], fields : list [str ]) -> dict [str , int ]:
450+ """
451+ Returns a count of unique values for multiple metadata fields, optionally scoped by a filter.
452+
453+ :param filters: A dictionary of filters to apply.
454+ :param fields: A list of metadata field names to count unique values for.
455+ :returns: A dictionary mapping each field name to the count of its unique values.
456+ """
415457 filtered_docs = self .filter_documents (filters )
416458 counts = defaultdict (int )
417459 # Wait, the return type is Dict[str, int] mapping field -> unique_count?
@@ -469,7 +511,8 @@ def load(self, index_path: str | Path) -> None:
469511 """
470512 path = Path (index_path )
471513 if not path .with_suffix (".faiss" ).exists ():
472- raise ValueError (f"File not found: { path .with_suffix ('.faiss' )} " )
514+ msg = f"File not found: { path .with_suffix ('.faiss' )} "
515+ raise ValueError (msg )
473516
474517 self .index = faiss .read_index (str (path .with_suffix (".faiss" )))
475518
@@ -485,5 +528,6 @@ def load(self, index_path: str | Path) -> None:
485528 # Verify sync
486529 if len (self .documents ) != len (self .id_map ):
487530 logger .warning (
488- f"Loaded { len (self .documents )} documents but { len (self .id_map )} ID mappings. Index might be out of sync."
531+ "Loaded %d documents but %d ID mappings. Index might be out of sync." ,
532+ len (self .documents ), len (self .id_map )
489533 )
0 commit comments