Skip to content

Commit e0be21f

Browse files
committed
adding get_field_unique_values
1 parent 310846d commit e0be21f

2 files changed

Lines changed: 119 additions & 2 deletions

File tree

integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1294,7 +1294,63 @@ async def get_field_min_max_async(self, metadata_field: str) -> dict[str, Any]:
12941294
def get_field_unique_values(
12951295
self, metadata_field: str, search_term: str | None, from_: int, size: int
12961296
) -> tuple[list[str], int]:
1297-
pass
1297+
"""
1298+
Returns unique values for a metadata field, optionally filtered by a search term in the content.
1299+
1300+
:param metadata_field: The metadata field to get unique values for.
1301+
:param search_term: Optional search term to filter documents by matching in the content field.
1302+
:param from_: The starting index for pagination.
1303+
:param size: The number of unique values to return.
1304+
:returns: A tuple containing (list of unique values, total count of unique values).
1305+
"""
1306+
self._ensure_initialized()
1307+
assert self._client is not None
1308+
1309+
field_name = self._normalize_metadata_field_name(metadata_field)
1310+
1311+
# filter by search_term if provided
1312+
query = {"match_all": {}}
1313+
if search_term:
1314+
# Use match_phrase for exact phrase matching to avoid tokenization issues
1315+
query = {"match_phrase": {"content": search_term}}
1316+
1317+
# Build aggregations
1318+
# Terms aggregation for paginated unique values
1319+
# Note: Terms aggregation doesn't support 'from' parameter directly,
1320+
# so we fetch from_ + size results and slice them
1321+
# Cardinality aggregation for total count
1322+
terms_size = from_ + size if from_ > 0 else size
1323+
body = {
1324+
"query": query,
1325+
"aggs": {
1326+
"unique_values": {
1327+
"terms": {
1328+
"field": field_name,
1329+
"size": terms_size,
1330+
}
1331+
},
1332+
"total_count": {
1333+
"cardinality": {
1334+
"field": field_name,
1335+
}
1336+
},
1337+
},
1338+
"size": 0, # we only need aggregations, not documents
1339+
}
1340+
1341+
result = self._client.search(index=self._index, body=body)
1342+
aggregations = result.get("aggregations", {})
1343+
1344+
# Extract unique values from terms aggregation buckets
1345+
unique_values_buckets = aggregations.get("unique_values", {}).get("buckets", [])
1346+
# Apply pagination by slicing the results
1347+
paginated_buckets = unique_values_buckets[from_ : from_ + size]
1348+
unique_values = [str(bucket["key"]) for bucket in paginated_buckets]
1349+
1350+
# Extract total count from cardinality aggregation
1351+
total_count = int(aggregations.get("total_count", {}).get("value", 0))
1352+
1353+
return unique_values, total_count
12981354

12991355
def query_sql(self, query: str):
    """
    Placeholder that is not implemented yet.

    Presumably intended to run a SQL query against the document store (TODO confirm); for now
    the argument is ignored and nothing is returned.

    :param query: The query string. Currently unused.
    """
    return None

integrations/opensearch/tests/test_document_store.py

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -708,4 +708,65 @@ def test_get_field_min_max(self, document_store: OpenSearchDocumentStore):
708708
# Test with float values
709709
min_max_score = document_store.get_field_min_max("meta.rating")
710710
assert min_max_score["min"] == pytest.approx(5.2)
711-
assert min_max_score["max"] == pytest.approx(20.3)
711+
assert min_max_score["max"] == pytest.approx(20.3)
712+
713+
def test_get_field_unique_values(self, document_store: OpenSearchDocumentStore):
    """
    Checks `get_field_unique_values` on string and integer metadata fields, covering
    pagination and filtering by a search term in the document content.
    """
    docs = [
        Document(content="Python programming", meta={"category": "A", "language": "Python"}),
        Document(content="Java programming", meta={"category": "B", "language": "Java"}),
        Document(content="Python scripting", meta={"category": "A", "language": "Python"}),
        Document(content="JavaScript development", meta={"category": "C", "language": "JavaScript"}),
        Document(content="Python data science", meta={"category": "A", "language": "Python"}),
        Document(content="Java backend", meta={"category": "B", "language": "Java"}),
    ]
    document_store.write_documents(docs)

    # All unique values of a string field, without a search term
    unique_values, total_count = document_store.get_field_unique_values("meta.category", None, 0, 10)
    assert set(unique_values) == {"A", "B", "C"}
    assert total_count == 3

    # Same for a second string field
    unique_languages, lang_count = document_store.get_field_unique_values("meta.language", None, 0, 10)
    assert set(unique_languages) == {"Python", "Java", "JavaScript"}
    assert lang_count == 3

    # Pagination - first page
    unique_values_page1, total_count = document_store.get_field_unique_values("meta.category", None, 0, 2)
    assert len(unique_values_page1) == 2
    assert total_count == 3
    assert all(val in ["A", "B", "C"] for val in unique_values_page1)

    # Pagination - second page
    unique_values_page2, total_count = document_store.get_field_unique_values("meta.category", None, 2, 2)
    assert len(unique_values_page2) == 1
    assert total_count == 3
    assert unique_values_page2[0] in ["A", "B", "C"]

    # The two pages must not overlap: together they cover every value exactly once
    assert set(unique_values_page1) | set(unique_values_page2) == {"A", "B", "C"}

    # Search term - filter by content matching "Python"
    unique_values_filtered, total_count = document_store.get_field_unique_values("meta.category", "Python", 0, 10)
    assert set(unique_values_filtered) == {"A"}  # Only category A has documents with "Python" in content
    assert total_count == 1

    # Search term - filter by content matching "Java"
    unique_values_java, total_count = document_store.get_field_unique_values("meta.category", "Java", 0, 10)
    assert set(unique_values_java) == {"B"}  # Only category B has documents with "Java" in content
    assert total_count == 1

    # Integer values are returned as strings
    int_docs = [
        Document(content="Doc 1", meta={"priority": 1}),
        Document(content="Doc 2", meta={"priority": 2}),
        Document(content="Doc 3", meta={"priority": 1}),
        Document(content="Doc 4", meta={"priority": 3}),
    ]
    document_store.write_documents(int_docs)
    unique_priorities, priority_count = document_store.get_field_unique_values("meta.priority", None, 0, 10)
    assert set(unique_priorities) == {"1", "2", "3"}
    assert priority_count == 3

    # Search term on an integer field
    unique_priorities_filtered, priority_count = document_store.get_field_unique_values(
        "meta.priority", "Doc 1", 0, 10
    )
    assert set(unique_priorities_filtered) == {"1"}
    assert priority_count == 1

0 commit comments

Comments
 (0)