fix: resolve lint errors, add py.typed, and clean up test mixins

GunaPalanivel · GunaPalanivel · commit 0451da9591dd · 2026-02-24T11:39:24.000+05:30
- Fixed Ruff E501, EM101, EM102, B905, E721, and PLC0415
- Added empty py.typed marker to src package
- Removed redundant FilterableDocsFixtureMixin from TestFAISSDocumentStore
diff --git a/integrations/faiss/src/haystack_integrations/document_stores/faiss/document_store.py b/integrations/faiss/src/haystack_integrations/document_stores/faiss/document_store.py
@@ -1,3 +1,7 @@
+# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
+#
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import logging
 from collections import defaultdict
@@ -60,9 +64,8 @@ def _create_new_index(self):
             base_index = faiss.index_factory(self.embedding_dim, self.index_string)
             self.index = faiss.IndexIDMap(base_index)
         except RuntimeError as e:
-            raise DocumentStoreError(
-                f"Could not create FAISS index with factory string '{self.index_string}': {e}"
-            ) from e
+            error_msg = f"Could not create FAISS index with factory string '{self.index_string}': {e}"
+            raise DocumentStoreError(error_msg) from e
 
     def count_documents(self) -> int:
         """
@@ -89,8 +92,10 @@ def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Docume
     def _matches_filters(self, doc: Document, filters: dict[str, Any]) -> bool:
         """
         Checks if a document matches the given filters.
-        Currently supports simple equality check for 'field' == 'value', and logical operators AND/OR/NOT are NOT fully implemented in this MVP helper.
-        Wait, Haystack 2.x filters are complex. We should use a proper filter parser or a simple recusive check if we want to support full syntax.
+        Currently supports simple equality check for 'field' == 'value'.
+        Logical operators AND/OR/NOT are NOT fully implemented in this MVP helper.
+        Wait, Haystack 2.x filters are complex. We should use a proper filter parser
+        or a simple recursive check if we want to support full syntax.
         For MVP, let's implement basic filtering logic.
         """
         if "operator" not in filters:
@@ -130,7 +135,8 @@ def write_documents(self, documents: list[Document], policy: DuplicatePolicy = D
         if policy == DuplicatePolicy.FAIL:
             for doc in documents:
                 if doc.id in self.documents:
-                    raise DuplicateDocumentError(f"Document with id '{doc.id}' already exists.")
+                    msg = f"Document with id '{doc.id}' already exists."
+                    raise DuplicateDocumentError(msg)
 
         # Process documents
         ids_to_add_to_index = []
@@ -223,8 +229,8 @@ def search(
 
         # Search in FAISS
         # Valid strategy for pre-filtering vs post-filtering:
-        # Since FAISS `IndexIDMap` doesn't support pre-filtering natively comfortably without `RangeSearch` or specialized impls,
-        # we usually fetch more (k * scale_factor) and filter post-retrieval.
+        # Since FAISS `IndexIDMap` has no convenient native pre-filtering (short of
+        # `RangeSearch` or specialized impls), we fetch more and filter post-retrieval.
 
         fetch_k = top_k
         if filters:
@@ -233,7 +239,7 @@ def search(
         distances, indices = self.index.search(query_vec, fetch_k)
 
         results = []
-        for dist, int_id in zip(distances[0], indices[0]):
+        for dist, int_id in zip(distances[0], indices[0], strict=False):
             if int_id == -1:
                 continue
 
@@ -262,40 +268,42 @@ def search(
 
         return results
 
-    def _get_result_to_documents(self, result) -> list[Document]:
-        # Compatibility/Helper if matching Chroma approach
-        return []
-
     def _check_condition(self, doc: Document, condition: dict[str, Any]) -> bool:
         if "operator" not in condition and "conditions" not in condition:
             # This might be a legacy or malformed filter from tests like test_missing_top_level_operator_key
             # The standard Haystack filter structure enforces keys.
             # On failure to parse standard structure, we should raise FilterError as per tests?
             # Actually, looking at the tests (e.g. TestFAISSDocumentStore.test_missing_top_level_operator_key),
             # they expect FilterError if "operator" is missing from a condition block.
-            raise FilterError("Filter condition missing 'operator'")
+            msg = "Filter condition missing 'operator'"
+            raise FilterError(msg)
 
         operator = condition.get("operator", "==")
 
         if operator == "AND":
             if "conditions" not in condition:
-                raise FilterError("Missing 'conditions' for AND operator")
+                msg = "Missing 'conditions' for AND operator"
+                raise FilterError(msg)
             return all(self._check_condition(doc, cond) for cond in condition.get("conditions", []))
         elif operator == "OR":
             if "conditions" not in condition:
-                raise FilterError("Missing 'conditions' for OR operator")
+                msg = "Missing 'conditions' for OR operator"
+                raise FilterError(msg)
             return any(self._check_condition(doc, cond) for cond in condition.get("conditions", []))
         elif operator == "NOT":
             if "conditions" not in condition:
-                raise FilterError("Missing 'conditions' for NOT operator")
+                msg = "Missing 'conditions' for NOT operator"
+                raise FilterError(msg)
             return not self._check_condition(doc, condition.get("conditions", [])[0])
 
         # Leaf condition
         if "field" not in condition:
-            raise FilterError("Missing 'field' in filter condition")
+            msg = "Missing 'field' in filter condition"
+            raise FilterError(msg)
         field = condition.get("field")
         if "value" not in condition:
-            raise FilterError("Missing 'value' in filter condition")
+            msg = "Missing 'value' in filter condition"
+            raise FilterError(msg)
         value = condition.get("value")
 
         doc_val = self._get_doc_value(doc, field)
@@ -307,7 +315,8 @@ def _check_condition(self, doc: Document, condition: dict[str, Any]) -> bool:
                 return False
 
             if value is None:
-                # Comparing anything with None using inequalities is invalid, but tests expect efficient handling (no match)
+                # Comparing anything with None using inequalities is invalid,
+                # but tests expect efficient handling (no match)
                 return False
 
             # Check for compatibility
@@ -318,9 +327,10 @@ def _check_condition(self, doc: Document, condition: dict[str, Any]) -> bool:
             if is_number_doc and is_number_val:
                 # Compatible
                 pass
-            elif type(doc_val) != type(value):
+            elif type(doc_val) is not type(value):
                 # Incompatible types for inequality implementation (like str vs int, or list vs int)
-                raise FilterError(f"Type mismatch: cannot compare {type(doc_val)} with {type(value)}")
+                msg = f"Type mismatch: cannot compare {type(doc_val)} with {type(value)}"
+                raise FilterError(msg)
 
             try:
                 if operator == ">":
@@ -332,19 +342,22 @@ def _check_condition(self, doc: Document, condition: dict[str, Any]) -> bool:
                 if operator == "<=":
                     return doc_val <= value
             except TypeError as e:
-                raise FilterError(f"Type mismatch in filter: {e}") from e
+                msg = f"Type mismatch in filter: {e}"
+                raise FilterError(msg) from e
 
         if operator == "==":
             return doc_val == value
         elif operator == "!=":
             return doc_val != value
         elif operator == "in":
             if not isinstance(value, list):
-                raise FilterError("Value for 'in' must be a list")
+                msg = "Value for 'in' must be a list"
+                raise FilterError(msg)
             return doc_val in value
         elif operator == "not in":
             if not isinstance(value, list):
-                raise FilterError("Value for 'not in' must be a list")
+                msg = "Value for 'not in' must be a list"
+                raise FilterError(msg)
             return doc_val not in value
 
         return False
@@ -361,6 +374,16 @@ def count_documents_by_filter(self, filters: dict[str, Any]) -> int:
         return len(self.filter_documents(filters))
 
     def update_by_filter(self, filters: dict[str, Any], meta: dict[str, Any]) -> int:
+        """
+        Updates documents that match the provided filters with the new metadata.
+
+        Note: Updates are performed in-memory only. To persist these changes,
+        you must explicitly call `save()` after updating.
+
+        :param filters: A dictionary of filters to apply to find documents to update.
+        :param meta: A dictionary of metadata key-value pairs to update in the matching documents.
+        :returns: The number of documents updated.
+        """
         docs_to_update = self.filter_documents(filters)
         for doc in docs_to_update:
             doc.meta.update(meta)
@@ -384,6 +407,12 @@ def get_metadata_fields_info(self) -> dict[str, dict[str, str]]:
         return fields_idx
 
     def get_metadata_field_min_max(self, field_name: str) -> dict[str, Any]:
+        """
+        Returns the minimum and maximum values for a specific metadata field.
+
+        :param field_name: The name of the metadata field.
+        :returns: A dictionary with keys "min" and "max" containing the respective min and max values.
+        """
         values = []
         for doc in self.documents.values():
             val = (
@@ -400,6 +429,12 @@ def get_metadata_field_min_max(self, field_name: str) -> dict[str, Any]:
         return {"min": min(values), "max": max(values)}
 
     def get_metadata_field_unique_values(self, field_name: str) -> list[Any]:
+        """
+        Returns all unique values for a specific metadata field.
+
+        :param field_name: The name of the metadata field.
+        :returns: A list of unique values for the specified field.
+        """
         values = set()
         for doc in self.documents.values():
             val = (
@@ -412,6 +447,13 @@ def get_metadata_field_unique_values(self, field_name: str) -> list[Any]:
         return list(values)
 
     def count_unique_metadata_by_filter(self, filters: dict[str, Any], fields: list[str]) -> dict[str, int]:
+        """
+        Returns a count of unique values for multiple metadata fields, optionally scoped by a filter.
+
+        :param filters: A dictionary of filters to apply.
+        :param fields: A list of metadata field names to count unique values for.
+        :returns: A dictionary mapping each field name to the count of its unique values.
+        """
         filtered_docs = self.filter_documents(filters)
         counts = defaultdict(int)
         # Wait, the return type is Dict[str, int] mapping field -> unique_count?
@@ -469,7 +511,8 @@ def load(self, index_path: str | Path) -> None:
         """
         path = Path(index_path)
         if not path.with_suffix(".faiss").exists():
-            raise ValueError(f"File not found: {path.with_suffix('.faiss')}")
+            msg = f"File not found: {path.with_suffix('.faiss')}"
+            raise ValueError(msg)
 
         self.index = faiss.read_index(str(path.with_suffix(".faiss")))
 
@@ -485,5 +528,6 @@ def load(self, index_path: str | Path) -> None:
         # Verify sync
         if len(self.documents) != len(self.id_map):
             logger.warning(
-                f"Loaded {len(self.documents)} documents but {len(self.id_map)} ID mappings. Index might be out of sync."
+                "Loaded %d documents but %d ID mappings. Index might be out of sync.",
+                len(self.documents), len(self.id_map)
             )
diff --git a/integrations/faiss/src/haystack_integrations/document_stores/faiss/py.typed b/integrations/faiss/src/haystack_integrations/document_stores/faiss/py.typed
diff --git a/integrations/faiss/tests/test_document_store.py b/integrations/faiss/tests/test_document_store.py
@@ -1,10 +1,10 @@
 import pytest
+from haystack.dataclasses import Document
 from haystack.testing.document_store import (
     CountDocumentsTest,
     DeleteAllTest,
     DeleteByFilterTest,
     DeleteDocumentsTest,
-    FilterableDocsFixtureMixin,
     FilterDocumentsTest,
     UpdateByFilterTest,
 )
@@ -16,7 +16,6 @@ class TestFAISSDocumentStore(
     CountDocumentsTest,
     DeleteDocumentsTest,
     FilterDocumentsTest,
-    FilterableDocsFixtureMixin,
     UpdateByFilterTest,
     DeleteAllTest,
     DeleteByFilterTest,
@@ -26,15 +25,13 @@ def document_store(self, tmp_path):
         return FAISSDocumentStore(index_path=str(tmp_path / "test_index"))
 
     def test_write_documents(self, document_store):
-        from haystack.dataclasses import Document
 
         doc = Document(content="test")
         document_store.write_documents([doc])
         assert document_store.count_documents() == 1
         assert document_store.filter_documents()[0].id == doc.id
 
     def test_persistence(self, tmp_path):
-        from haystack.dataclasses import Document
 
         path = tmp_path / "persistent_index"
         ds = FAISSDocumentStore(index_path=str(path), embedding_dim=3)
@@ -50,7 +47,6 @@ def test_persistence(self, tmp_path):
         assert ds_loaded.filter_documents()[0].embedding == [0.1, 0.2, 0.3]
 
     def test_persistence_no_embeddings(self, tmp_path):
-        from haystack.dataclasses import Document
 
         path = tmp_path / "persistent_index_no_embed"
         ds = FAISSDocumentStore(index_path=str(path), embedding_dim=3)
@@ -72,13 +68,12 @@ def test_load_missing_files(self, tmp_path):
             ds.load(path)
 
     def test_search_with_and_without_filters(self, document_store):
-        from haystack.dataclasses import Document
 
         # Setup documents with missing/varied embeddings to test edge cases
         doc1 = Document(content="test1", embedding=[0.1, 0.2, 0.3], meta={"category": "A"})
         doc2 = Document(content="test2", embedding=[0.4, 0.5, 0.6], meta={"category": "B"})
         doc3 = Document(content="test3", meta={"category": "A"})  # No embedding
-        
+
         # document_store from fixture uses default embedding_dim=768, so we must recreate
         ds = FAISSDocumentStore(index_path=document_store.index_path, embedding_dim=3)
         ds.write_documents([doc1, doc2, doc3])
@@ -87,51 +82,47 @@ def test_search_with_and_without_filters(self, document_store):
         results = ds.search(query_embedding=[0.1, 0.2, 0.3], top_k=2)
         assert len(results) == 2
         assert results[0].content == "test1"  # Closest match
-        
+
         # Test search with filter
         results_filtered = ds.search(
-            query_embedding=[0.1, 0.2, 0.3], 
-            top_k=2, 
-            filters={"field": "meta.category", "operator": "==", "value": "B"}
+            query_embedding=[0.1, 0.2, 0.3], top_k=2, filters={"field": "meta.category", "operator": "==", "value": "B"}
         )
         assert len(results_filtered) == 1
         assert results_filtered[0].content == "test2"
-        
+
     def test_to_dict_from_dict(self):
         ds = FAISSDocumentStore(index_path="test_index", index_string="Flat", embedding_dim=128)
-        
+
         data = ds.to_dict()
         assert data["type"] == "haystack_integrations.document_stores.faiss.document_store.FAISSDocumentStore"
         assert data["init_parameters"]["index_path"] == "test_index"
         assert data["init_parameters"]["index_string"] == "Flat"
         assert data["init_parameters"]["embedding_dim"] == 128
-        
+
         ds_loaded = FAISSDocumentStore.from_dict(data)
         assert ds_loaded.index_path == "test_index"
         assert ds_loaded.index_string == "Flat"
         assert ds_loaded.embedding_dim == 128
 
     def test_count_documents_by_filter(self, document_store):
-        from haystack.dataclasses import Document
-        
+
         docs = [
             Document(content="test1", meta={"category": "A"}),
             Document(content="test2", meta={"category": "B"}),
-            Document(content="test3", meta={"category": "A"})
+            Document(content="test3", meta={"category": "A"}),
         ]
         document_store.write_documents(docs)
-        
-        count = document_store.count_documents_by_filter(filters={"field": "meta.category", "operator": "==", "value": "A"})
+
+        count = document_store.count_documents_by_filter(
+            filters={"field": "meta.category", "operator": "==", "value": "A"}
+        )
         assert count == 2
 
     def test_get_metadata_fields_info(self, document_store):
-        from haystack.dataclasses import Document
-        
-        docs = [
-            Document(content="test1", meta={"category": "A", "count": 1, "is_active": True})
-        ]
+
+        docs = [Document(content="test1", meta={"category": "A", "count": 1, "is_active": True})]
         document_store.write_documents(docs)
-        
+
         info = document_store.get_metadata_fields_info()
         assert "category" in info
         assert info["category"]["type"] == "keyword"
@@ -141,18 +132,16 @@ def test_get_metadata_fields_info(self, document_store):
         assert info["is_active"]["type"] == "boolean"
 
     def test_count_unique_metadata_by_filter(self, document_store):
-        from haystack.dataclasses import Document
-        
+
         docs = [
             Document(content="test1", meta={"category": "A", "status": "active"}),
             Document(content="test2", meta={"category": "B", "status": "inactive"}),
-            Document(content="test3", meta={"category": "A", "status": "active"})
+            Document(content="test3", meta={"category": "A", "status": "active"}),
         ]
         document_store.write_documents(docs)
-        
+
         counts = document_store.count_unique_metadata_by_filter(
-            filters={"field": "meta.category", "operator": "==", "value": "A"},
-            fields=["meta.status"]
+            filters={"field": "meta.category", "operator": "==", "value": "A"}, fields=["meta.status"]
         )
         assert "meta.status" in counts
         assert counts["meta.status"] == 1  # Only "active" status for category A