Skip to content

Commit 0451da9

Browse files
committed
fix: resolve lint errors, add py.typed, and clean up test mixins
- Fixed Ruff E501, EM101, EM102, B905, E721, and PLC0415 - Added empty py.typed marker to src package - Removed redundant FilterableDocsFixtureMixin from TestFAISSDocumentStore
1 parent 583f6ae commit 0451da9

3 files changed

Lines changed: 91 additions & 58 deletions

File tree

integrations/faiss/src/haystack_integrations/document_stores/faiss/document_store.py

Lines changed: 71 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
15
import json
26
import logging
37
from collections import defaultdict
@@ -60,9 +64,8 @@ def _create_new_index(self):
6064
base_index = faiss.index_factory(self.embedding_dim, self.index_string)
6165
self.index = faiss.IndexIDMap(base_index)
6266
except RuntimeError as e:
63-
raise DocumentStoreError(
64-
f"Could not create FAISS index with factory string '{self.index_string}': {e}"
65-
) from e
67+
error_msg = f"Could not create FAISS index with factory string '{self.index_string}': {e}"
68+
raise DocumentStoreError(error_msg) from e
6669

6770
def count_documents(self) -> int:
6871
"""
@@ -89,8 +92,10 @@ def filter_documents(self, filters: dict[str, Any] | None = None) -> list[Docume
8992
def _matches_filters(self, doc: Document, filters: dict[str, Any]) -> bool:
9093
"""
9194
Checks if a document matches the given filters.
92-
Currently supports simple equality check for 'field' == 'value', and logical operators AND/OR/NOT are NOT fully implemented in this MVP helper.
93-
Wait, Haystack 2.x filters are complex. We should use a proper filter parser or a simple recusive check if we want to support full syntax.
95+
Currently supports simple equality check for 'field' == 'value'.
96+
Logical operators AND/OR/NOT are NOT fully implemented in this MVP helper.
97+
Wait, Haystack 2.x filters are complex. We should use a proper filter parser
98+
or a simple recusive check if we want to support full syntax.
9499
For MVP, let's implement basic filtering logic.
95100
"""
96101
if "operator" not in filters:
@@ -130,7 +135,8 @@ def write_documents(self, documents: list[Document], policy: DuplicatePolicy = D
130135
if policy == DuplicatePolicy.FAIL:
131136
for doc in documents:
132137
if doc.id in self.documents:
133-
raise DuplicateDocumentError(f"Document with id '{doc.id}' already exists.")
138+
msg = f"Document with id '{doc.id}' already exists."
139+
raise DuplicateDocumentError(msg)
134140

135141
# Process documents
136142
ids_to_add_to_index = []
@@ -223,8 +229,8 @@ def search(
223229

224230
# Search in FAISS
225231
# Valid strategy for pre-filtering vs post-filtering:
226-
# Since FAISS `IndexIDMap` doesn't support pre-filtering natively comfortably without `RangeSearch` or specialized impls,
227-
# we usually fetch more (k * scale_factor) and filter post-retrieval.
232+
# Since FAISS `IndexIDMap` doesn't support pre-filtering natively comfortably
233+
# without `RangeSearch` or specialized impls, we fetch more and filter post-retrieval.
228234

229235
fetch_k = top_k
230236
if filters:
@@ -233,7 +239,7 @@ def search(
233239
distances, indices = self.index.search(query_vec, fetch_k)
234240

235241
results = []
236-
for dist, int_id in zip(distances[0], indices[0]):
242+
for dist, int_id in zip(distances[0], indices[0], strict=False):
237243
if int_id == -1:
238244
continue
239245

@@ -262,40 +268,42 @@ def search(
262268

263269
return results
264270

265-
def _get_result_to_documents(self, result) -> list[Document]:
266-
# Compatibility/Helper if matching Chroma approach
267-
return []
268-
269271
def _check_condition(self, doc: Document, condition: dict[str, Any]) -> bool:
270272
if "operator" not in condition and "conditions" not in condition:
271273
# This might be a legacy or malformed filter from tests like test_missing_top_level_operator_key
272274
# The standard Haystack filter structure enforces keys.
273275
# On failure to parse standard structure, we should raise FilterError as per tests?
274276
# Actually, looking at the tests (e.g. TestFAISSDocumentStore.test_missing_top_level_operator_key),
275277
# they expect FilterError if "operator" is missing from a condition block.
276-
raise FilterError("Filter condition missing 'operator'")
278+
msg = "Filter condition missing 'operator'"
279+
raise FilterError(msg)
277280

278281
operator = condition.get("operator", "==")
279282

280283
if operator == "AND":
281284
if "conditions" not in condition:
282-
raise FilterError("Missing 'conditions' for AND operator")
285+
msg = "Missing 'conditions' for AND operator"
286+
raise FilterError(msg)
283287
return all(self._check_condition(doc, cond) for cond in condition.get("conditions", []))
284288
elif operator == "OR":
285289
if "conditions" not in condition:
286-
raise FilterError("Missing 'conditions' for OR operator")
290+
msg = "Missing 'conditions' for OR operator"
291+
raise FilterError(msg)
287292
return any(self._check_condition(doc, cond) for cond in condition.get("conditions", []))
288293
elif operator == "NOT":
289294
if "conditions" not in condition:
290-
raise FilterError("Missing 'conditions' for NOT operator")
295+
msg = "Missing 'conditions' for NOT operator"
296+
raise FilterError(msg)
291297
return not self._check_condition(doc, condition.get("conditions", [])[0])
292298

293299
# Leaf condition
294300
if "field" not in condition:
295-
raise FilterError("Missing 'field' in filter condition")
301+
msg = "Missing 'field' in filter condition"
302+
raise FilterError(msg)
296303
field = condition.get("field")
297304
if "value" not in condition:
298-
raise FilterError("Missing 'value' in filter condition")
305+
msg = "Missing 'value' in filter condition"
306+
raise FilterError(msg)
299307
value = condition.get("value")
300308

301309
doc_val = self._get_doc_value(doc, field)
@@ -307,7 +315,8 @@ def _check_condition(self, doc: Document, condition: dict[str, Any]) -> bool:
307315
return False
308316

309317
if value is None:
310-
# Comparing anything with None using inequalities is invalid, but tests expect efficient handling (no match)
318+
# Comparing anything with None using inequalities is invalid,
319+
# but tests expect efficient handling (no match)
311320
return False
312321

313322
# Check for compatibility
@@ -318,9 +327,10 @@ def _check_condition(self, doc: Document, condition: dict[str, Any]) -> bool:
318327
if is_number_doc and is_number_val:
319328
# Compatible
320329
pass
321-
elif type(doc_val) != type(value):
330+
elif type(doc_val) is not type(value):
322331
# Incompatible types for inequality implementation (like str vs int, or list vs int)
323-
raise FilterError(f"Type mismatch: cannot compare {type(doc_val)} with {type(value)}")
332+
msg = f"Type mismatch: cannot compare {type(doc_val)} with {type(value)}"
333+
raise FilterError(msg)
324334

325335
try:
326336
if operator == ">":
@@ -332,19 +342,22 @@ def _check_condition(self, doc: Document, condition: dict[str, Any]) -> bool:
332342
if operator == "<=":
333343
return doc_val <= value
334344
except TypeError as e:
335-
raise FilterError(f"Type mismatch in filter: {e}") from e
345+
msg = f"Type mismatch in filter: {e}"
346+
raise FilterError(msg) from e
336347

337348
if operator == "==":
338349
return doc_val == value
339350
elif operator == "!=":
340351
return doc_val != value
341352
elif operator == "in":
342353
if not isinstance(value, list):
343-
raise FilterError("Value for 'in' must be a list")
354+
msg = "Value for 'in' must be a list"
355+
raise FilterError(msg)
344356
return doc_val in value
345357
elif operator == "not in":
346358
if not isinstance(value, list):
347-
raise FilterError("Value for 'not in' must be a list")
359+
msg = "Value for 'not in' must be a list"
360+
raise FilterError(msg)
348361
return doc_val not in value
349362

350363
return False
@@ -361,6 +374,16 @@ def count_documents_by_filter(self, filters: dict[str, Any]) -> int:
361374
return len(self.filter_documents(filters))
362375

363376
def update_by_filter(self, filters: dict[str, Any], meta: dict[str, Any]) -> int:
377+
"""
378+
Updates documents that match the provided filters with the new metadata.
379+
380+
Note: Updates are performed in-memory only. To persist these changes,
381+
you must explicitly call `save()` after updating.
382+
383+
:param filters: A dictionary of filters to apply to find documents to update.
384+
:param meta: A dictionary of metadata key-value pairs to update in the matching documents.
385+
:returns: The number of documents updated.
386+
"""
364387
docs_to_update = self.filter_documents(filters)
365388
for doc in docs_to_update:
366389
doc.meta.update(meta)
@@ -384,6 +407,12 @@ def get_metadata_fields_info(self) -> dict[str, dict[str, str]]:
384407
return fields_idx
385408

386409
def get_metadata_field_min_max(self, field_name: str) -> dict[str, Any]:
410+
"""
411+
Returns the minimum and maximum values for a specific metadata field.
412+
413+
:param field_name: The name of the metadata field.
414+
:returns: A dictionary with keys "min" and "max" containing the respective min and max values.
415+
"""
387416
values = []
388417
for doc in self.documents.values():
389418
val = (
@@ -400,6 +429,12 @@ def get_metadata_field_min_max(self, field_name: str) -> dict[str, Any]:
400429
return {"min": min(values), "max": max(values)}
401430

402431
def get_metadata_field_unique_values(self, field_name: str) -> list[Any]:
432+
"""
433+
Returns all unique values for a specific metadata field.
434+
435+
:param field_name: The name of the metadata field.
436+
:returns: A list of unique values for the specified field.
437+
"""
403438
values = set()
404439
for doc in self.documents.values():
405440
val = (
@@ -412,6 +447,13 @@ def get_metadata_field_unique_values(self, field_name: str) -> list[Any]:
412447
return list(values)
413448

414449
def count_unique_metadata_by_filter(self, filters: dict[str, Any], fields: list[str]) -> dict[str, int]:
450+
"""
451+
Returns a count of unique values for multiple metadata fields, optionally scoped by a filter.
452+
453+
:param filters: A dictionary of filters to apply.
454+
:param fields: A list of metadata field names to count unique values for.
455+
:returns: A dictionary mapping each field name to the count of its unique values.
456+
"""
415457
filtered_docs = self.filter_documents(filters)
416458
counts = defaultdict(int)
417459
# Wait, the return type is Dict[str, int] mapping field -> unique_count?
@@ -469,7 +511,8 @@ def load(self, index_path: str | Path) -> None:
469511
"""
470512
path = Path(index_path)
471513
if not path.with_suffix(".faiss").exists():
472-
raise ValueError(f"File not found: {path.with_suffix('.faiss')}")
514+
msg = f"File not found: {path.with_suffix('.faiss')}"
515+
raise ValueError(msg)
473516

474517
self.index = faiss.read_index(str(path.with_suffix(".faiss")))
475518

@@ -485,5 +528,6 @@ def load(self, index_path: str | Path) -> None:
485528
# Verify sync
486529
if len(self.documents) != len(self.id_map):
487530
logger.warning(
488-
f"Loaded {len(self.documents)} documents but {len(self.id_map)} ID mappings. Index might be out of sync."
531+
"Loaded %d documents but %d ID mappings. Index might be out of sync.",
532+
len(self.documents), len(self.id_map)
489533
)

integrations/faiss/src/haystack_integrations/document_stores/faiss/py.typed

Whitespace-only changes.
Lines changed: 20 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
import pytest
2+
from haystack.dataclasses import Document
23
from haystack.testing.document_store import (
34
CountDocumentsTest,
45
DeleteAllTest,
56
DeleteByFilterTest,
67
DeleteDocumentsTest,
7-
FilterableDocsFixtureMixin,
88
FilterDocumentsTest,
99
UpdateByFilterTest,
1010
)
@@ -16,7 +16,6 @@ class TestFAISSDocumentStore(
1616
CountDocumentsTest,
1717
DeleteDocumentsTest,
1818
FilterDocumentsTest,
19-
FilterableDocsFixtureMixin,
2019
UpdateByFilterTest,
2120
DeleteAllTest,
2221
DeleteByFilterTest,
@@ -26,15 +25,13 @@ def document_store(self, tmp_path):
2625
return FAISSDocumentStore(index_path=str(tmp_path / "test_index"))
2726

2827
def test_write_documents(self, document_store):
29-
from haystack.dataclasses import Document
3028

3129
doc = Document(content="test")
3230
document_store.write_documents([doc])
3331
assert document_store.count_documents() == 1
3432
assert document_store.filter_documents()[0].id == doc.id
3533

3634
def test_persistence(self, tmp_path):
37-
from haystack.dataclasses import Document
3835

3936
path = tmp_path / "persistent_index"
4037
ds = FAISSDocumentStore(index_path=str(path), embedding_dim=3)
@@ -50,7 +47,6 @@ def test_persistence(self, tmp_path):
5047
assert ds_loaded.filter_documents()[0].embedding == [0.1, 0.2, 0.3]
5148

5249
def test_persistence_no_embeddings(self, tmp_path):
53-
from haystack.dataclasses import Document
5450

5551
path = tmp_path / "persistent_index_no_embed"
5652
ds = FAISSDocumentStore(index_path=str(path), embedding_dim=3)
@@ -72,13 +68,12 @@ def test_load_missing_files(self, tmp_path):
7268
ds.load(path)
7369

7470
def test_search_with_and_without_filters(self, document_store):
75-
from haystack.dataclasses import Document
7671

7772
# Setup documents with missing/varied embeddings to test edge cases
7873
doc1 = Document(content="test1", embedding=[0.1, 0.2, 0.3], meta={"category": "A"})
7974
doc2 = Document(content="test2", embedding=[0.4, 0.5, 0.6], meta={"category": "B"})
8075
doc3 = Document(content="test3", meta={"category": "A"}) # No embedding
81-
76+
8277
# document_store from fixture uses default embedding_dim=768, so we must recreate
8378
ds = FAISSDocumentStore(index_path=document_store.index_path, embedding_dim=3)
8479
ds.write_documents([doc1, doc2, doc3])
@@ -87,51 +82,47 @@ def test_search_with_and_without_filters(self, document_store):
8782
results = ds.search(query_embedding=[0.1, 0.2, 0.3], top_k=2)
8883
assert len(results) == 2
8984
assert results[0].content == "test1" # Closest match
90-
85+
9186
# Test search with filter
9287
results_filtered = ds.search(
93-
query_embedding=[0.1, 0.2, 0.3],
94-
top_k=2,
95-
filters={"field": "meta.category", "operator": "==", "value": "B"}
88+
query_embedding=[0.1, 0.2, 0.3], top_k=2, filters={"field": "meta.category", "operator": "==", "value": "B"}
9689
)
9790
assert len(results_filtered) == 1
9891
assert results_filtered[0].content == "test2"
99-
92+
10093
def test_to_dict_from_dict(self):
10194
ds = FAISSDocumentStore(index_path="test_index", index_string="Flat", embedding_dim=128)
102-
95+
10396
data = ds.to_dict()
10497
assert data["type"] == "haystack_integrations.document_stores.faiss.document_store.FAISSDocumentStore"
10598
assert data["init_parameters"]["index_path"] == "test_index"
10699
assert data["init_parameters"]["index_string"] == "Flat"
107100
assert data["init_parameters"]["embedding_dim"] == 128
108-
101+
109102
ds_loaded = FAISSDocumentStore.from_dict(data)
110103
assert ds_loaded.index_path == "test_index"
111104
assert ds_loaded.index_string == "Flat"
112105
assert ds_loaded.embedding_dim == 128
113106

114107
def test_count_documents_by_filter(self, document_store):
115-
from haystack.dataclasses import Document
116-
108+
117109
docs = [
118110
Document(content="test1", meta={"category": "A"}),
119111
Document(content="test2", meta={"category": "B"}),
120-
Document(content="test3", meta={"category": "A"})
112+
Document(content="test3", meta={"category": "A"}),
121113
]
122114
document_store.write_documents(docs)
123-
124-
count = document_store.count_documents_by_filter(filters={"field": "meta.category", "operator": "==", "value": "A"})
115+
116+
count = document_store.count_documents_by_filter(
117+
filters={"field": "meta.category", "operator": "==", "value": "A"}
118+
)
125119
assert count == 2
126120

127121
def test_get_metadata_fields_info(self, document_store):
128-
from haystack.dataclasses import Document
129-
130-
docs = [
131-
Document(content="test1", meta={"category": "A", "count": 1, "is_active": True})
132-
]
122+
123+
docs = [Document(content="test1", meta={"category": "A", "count": 1, "is_active": True})]
133124
document_store.write_documents(docs)
134-
125+
135126
info = document_store.get_metadata_fields_info()
136127
assert "category" in info
137128
assert info["category"]["type"] == "keyword"
@@ -141,18 +132,16 @@ def test_get_metadata_fields_info(self, document_store):
141132
assert info["is_active"]["type"] == "boolean"
142133

143134
def test_count_unique_metadata_by_filter(self, document_store):
144-
from haystack.dataclasses import Document
145-
135+
146136
docs = [
147137
Document(content="test1", meta={"category": "A", "status": "active"}),
148138
Document(content="test2", meta={"category": "B", "status": "inactive"}),
149-
Document(content="test3", meta={"category": "A", "status": "active"})
139+
Document(content="test3", meta={"category": "A", "status": "active"}),
150140
]
151141
document_store.write_documents(docs)
152-
142+
153143
counts = document_store.count_unique_metadata_by_filter(
154-
filters={"field": "meta.category", "operator": "==", "value": "A"},
155-
fields=["meta.status"]
144+
filters={"field": "meta.category", "operator": "==", "value": "A"}, fields=["meta.status"]
156145
)
157146
assert "meta.status" in counts
158147
assert counts["meta.status"] == 1 # Only "active" status for category A

0 commit comments

Comments
 (0)