Implement query_sql and query_sql_async with selectable response formats and add tests

davidsbatista · davidsbatista · commit 0c0f31cb47b6 · 2026-01-05T23:48:22.000+01:00
diff --git a/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py b/integrations/opensearch/src/haystack_integrations/document_stores/opensearch/document_store.py
@@ -2,16 +2,19 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import json
 from collections.abc import Mapping
 from math import exp
-from typing import Any, Optional, Union
+from typing import Any, Literal, Optional, Union
 
+import requests
 from haystack import default_from_dict, default_to_dict, logging
 from haystack.dataclasses import Document
 from haystack.document_stores.errors import DocumentStoreError, DuplicateDocumentError
 from haystack.document_stores.types import DuplicatePolicy
 from haystack.utils.auth import Secret
 from opensearchpy import AsyncHttpConnection, AsyncOpenSearch, OpenSearch
+from opensearchpy.exceptions import SerializationError
 from opensearchpy.helpers import async_bulk, bulk
 
 from haystack_integrations.document_stores.opensearch.auth import AsyncAWSAuth, AWSAuth
@@ -21,6 +24,8 @@
 
 Hosts = Union[str, list[Union[str, Mapping[str, Union[str, int]]]]]
 
+ResponseFormat = Literal["json", "jdbc", "csv", "raw"]
+
 # document scores are essentially unbounded and will be scaled to values between 0 and 1 if scale_score is set to
 # True. Scaling uses the expit function (inverse of the logit function) after applying a scaling factor
 # (e.g., BM25_SCALING_FACTOR for the bm25_retrieval method).
@@ -1309,7 +1314,7 @@ def get_field_unique_values(
         field_name = self._normalize_metadata_field_name(metadata_field)
 
         # filter by search_term if provided
-        query = {"match_all": {}}
+        query: dict[str, Any] = {"match_all": {}}
         if search_term:
             # Use match_phrase for exact phrase matching to avoid tokenization issues
             query = {"match_phrase": {"content": search_term}}
@@ -1370,7 +1375,7 @@ async def get_field_unique_values_async(
         field_name = self._normalize_metadata_field_name(metadata_field)
 
         # filter by search_term if provided
-        query = {"match_all": {}}
+        query: dict[str, Any] = {"match_all": {}}
         if search_term:
             # Use match_phrase for exact phrase matching to avoid tokenization issues
             query = {"match_phrase": {"content": search_term}}
@@ -1413,5 +1418,161 @@ async def get_field_unique_values_async(
 
         return unique_values, total_count
 
-    def query_sql(self, query: str):
-        pass
+    def query_sql(self, query: str, response_format: ResponseFormat = "json") -> Any:
+        """
+        Execute a raw OpenSearch SQL query against the index.
+
+        The "json" format is requested through the opensearch-py client. The other formats are first requested
+        with `requests`, so that the body comes back as plain text instead of going through the client's
+        deserializer. If that direct request fails, or the configured auth cannot be passed to `requests`,
+        the opensearch-py client is used instead.
+
+        :param query: The OpenSearch SQL query to execute
+        :param response_format: The format of the response. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/
+        :returns: The query results in the specified format. For JSON format, returns a list of dictionaries
+            (the _source from each hit), or the response as received when it has no "hits" key. For other
+            formats (csv, jdbc, raw), returns the response as text.
+        :raises DocumentStoreError: If the query fails or the response cannot be deserialized by the client.
+        """
+        self._ensure_initialized()
+        assert self._client is not None
+
+        # `requests` can only carry "no auth" or a basic-auth tuple. Any other auth object (e.g. AWSAuth) skips
+        # the direct request, which would otherwise go out unauthenticated, and uses the client right away.
+        direct_auth_ok = not self._http_auth or isinstance(self._http_auth, tuple)
+        if response_format != "json" and direct_auth_ok:
+            try:
+                # Reuse the host of one of the client's connections
+                connection = self._client.transport.get_connection()
+                url = f"{connection.host}/_plugins/_sql"
+                response = requests.post(
+                    url,
+                    params={"format": response_format},
+                    json={"query": query},
+                    headers={"Content-Type": "application/json"},
+                    auth=self._http_auth or None,
+                    verify=self._verify_certs if self._verify_certs is not None else True,
+                    timeout=self._timeout if self._timeout is not None else 30.0,
+                )
+                response.raise_for_status()
+                return response.text
+            except requests.exceptions.RequestException as e:
+                # Best effort: fall back to the client below, but leave a trace of why the direct request failed
+                logging.getLogger(__name__).debug(
+                    "Direct SQL request failed, falling back to the OpenSearch client: {error}", error=str(e)
+                )
+
+        # "json" format, or fallback for the other formats: go through the opensearch-py client
+        try:
+            response_data = self._client.transport.perform_request(
+                method="POST",
+                url="/_plugins/_sql",
+                params={"format": response_format},
+                body={"query": query},
+            )
+
+            if response_format == "json":
+                # extract only the query results
+                if isinstance(response_data, dict) and "hits" in response_data:
+                    hits = response_data.get("hits", {}).get("hits", [])
+                    # extract _source from each hit, which contains the actual document data
+                    return [hit.get("_source", {}) for hit in hits]
+                return response_data
+            if isinstance(response_data, str):
+                return response_data
+            # A deserialized body (e.g. jdbc) is turned back into JSON text; str() would give a Python repr
+            if isinstance(response_data, (dict, list)):
+                return json.dumps(response_data)
+            return str(response_data)
+        except SerializationError:
+            # The client could not deserialize the body returned for this format
+            msg = (
+                f"Failed to execute SQL query in OpenSearch: Unable to deserialize {response_format} response. "
+                "This format may not be supported with the current authentication method."
+            )
+            raise DocumentStoreError(msg) from None
+        except Exception as e:
+            msg = f"Failed to execute SQL query in OpenSearch: {e!s}"
+            raise DocumentStoreError(msg) from e
+
+    async def query_sql_async(self, query: str, response_format: ResponseFormat = "json") -> Any:
+        """
+        Asynchronously execute a raw OpenSearch SQL query against the index.
+
+        The "json" format is requested through the async opensearch-py client. The other formats are first
+        requested with `httpx` (when it is installed), so that the body comes back as plain text instead of
+        going through the client's deserializer. If that direct request fails, or the configured auth cannot
+        be passed to `httpx`, the async opensearch-py client is used instead.
+
+        :param query: The OpenSearch SQL query to execute
+        :param response_format: The format of the response. See https://docs.opensearch.org/latest/search-plugins/sql/response-formats/
+        :returns: The query results in the specified format. For JSON format, returns a list of dictionaries
+            (the _source from each hit), or the response as received when it has no "hits" key. For other
+            formats (csv, jdbc, raw), returns the response as text.
+        :raises DocumentStoreError: If the query fails or the response cannot be deserialized by the client.
+        """
+        await self._ensure_initialized_async()
+        assert self._async_client is not None
+
+        # `httpx` is only given "no auth" or a basic-auth tuple. Any other auth object (e.g. AWSAuth or
+        # AsyncAWSAuth) skips the direct request, which would otherwise go out unauthenticated.
+        direct_auth_ok = not self._http_auth or isinstance(self._http_auth, tuple)
+        if response_format != "json" and direct_auth_ok:
+            try:
+                import httpx
+
+                connection = self._async_client.transport.get_connection()
+                url = f"{connection.host}/_plugins/_sql"
+                verify = self._verify_certs if self._verify_certs is not None else True
+                # Only an unset timeout falls back to the 30 seconds default
+                timeout = httpx.Timeout(self._timeout if self._timeout is not None else 30.0)
+                async with httpx.AsyncClient(verify=verify, timeout=timeout) as client:
+                    response = await client.post(
+                        url,
+                        params={"format": response_format},
+                        json={"query": query},
+                        headers={"Content-Type": "application/json"},
+                        auth=self._http_auth or None,
+                    )
+                    response.raise_for_status()
+                    return response.text
+            except ImportError:
+                # httpx not available, fall through to opensearchpy
+                pass
+            except httpx.HTTPError as e:
+                # Best effort: fall back to the client below, but leave a trace of why the direct request failed
+                logging.getLogger(__name__).debug(
+                    "Direct SQL request failed, falling back to the OpenSearch client: {error}", error=str(e)
+                )
+
+        try:
+            response_data = await self._async_client.transport.perform_request(
+                method="POST",
+                url="/_plugins/_sql",
+                params={"format": response_format},
+                body={"query": query},
+            )
+
+            if response_format == "json":
+                # extract only the query results
+                if isinstance(response_data, dict) and "hits" in response_data:
+                    hits = response_data.get("hits", {}).get("hits", [])
+                    # extract _source from each hit, which contains the actual document data
+                    return [hit.get("_source", {}) for hit in hits]
+                return response_data
+            if isinstance(response_data, str):
+                return response_data
+            # A deserialized body (e.g. jdbc) is turned back into JSON text; str() would give a Python repr
+            if isinstance(response_data, (dict, list)):
+                return json.dumps(response_data)
+            return str(response_data)
+        except SerializationError:
+            msg = (
+                f"Failed to execute SQL query in OpenSearch: Unable to deserialize {response_format} response. "
+                "This format may not be supported with the current authentication method. "
+                "Consider installing httpx for better support."
+            )
+            raise DocumentStoreError(msg) from None
+        except Exception as e:
+            msg = f"Failed to execute SQL query in OpenSearch: {e!s}"
+            raise DocumentStoreError(msg) from e
diff --git a/integrations/opensearch/tests/test_document_store.py b/integrations/opensearch/tests/test_document_store.py
@@ -772,3 +772,65 @@ def test_get_field_unique_values(self, document_store: OpenSearchDocumentStore):
         )
         assert set(unique_priorities_filtered) == {"1"}
         assert priority_count == 1
+
+    def test_query_sql(self, document_store: OpenSearchDocumentStore):
+        """
+        Run SQL queries through `query_sql` and check what comes back.
+
+        Covers the four response formats (json, csv, jdbc, raw), a COUNT query and a query against
+        `non_existent_index`.
+        """
+        # Two documents in category A, one each in categories B and C
+        documents = [
+            Document(content="Python programming", meta={"category": "A", "status": "active", "priority": 1}),
+            Document(content="Java programming", meta={"category": "B", "status": "active", "priority": 2}),
+            Document(content="Python scripting", meta={"category": "A", "status": "inactive", "priority": 3}),
+            Document(content="JavaScript development", meta={"category": "C", "status": "active", "priority": 1}),
+        ]
+        document_store.write_documents(documents)
+        time.sleep(1)  # Wait for documents to be indexed
+
+        # JSON format (the default) returns a list of dictionaries (the _source from each hit)
+        category_a_query = (
+            f"SELECT content, category, status, priority FROM {document_store._index} "  # noqa: S608
+            f"WHERE category = 'A' ORDER BY priority"
+        )
+        rows = document_store.query_sql(category_a_query, response_format="json")
+
+        # Shape of the result: a list with one dictionary per matching document
+        assert isinstance(rows, list)
+        assert len(rows) == 2  # Two documents with category A
+        assert all(isinstance(row, dict) for row in rows)
+
+        # Only documents with category A are returned
+        assert {row.get("category") for row in rows} == {"A"}
+
+        # Every selected field is present in every row
+        expected_fields = ("content", "category", "status", "priority")
+        for row in rows:
+            for field in expected_fields:
+                assert field in row
+
+        # CSV format comes back as text that mentions the selected columns
+        csv_text = document_store.query_sql(category_a_query, response_format="csv")
+        assert isinstance(csv_text, str)
+        for column in ("content", "category"):
+            assert column in csv_text
+
+        # JDBC format can be dict or str depending on OpenSearch version
+        jdbc_result = document_store.query_sql(category_a_query, response_format="jdbc")
+        assert jdbc_result is not None
+
+        # RAW format comes back as text
+        raw_text = document_store.query_sql(category_a_query, response_format="raw")
+        assert isinstance(raw_text, str)
+
+        # COUNT query may return different format, check it's a valid response
+        count_query = f"SELECT COUNT(*) as total FROM {document_store._index}"  # noqa: S608
+        total = document_store.query_sql(count_query, response_format="json")
+        assert total is not None
+
+        # Error handling: a failing query must surface as a DocumentStoreError
+        invalid_query = "SELECT * FROM non_existent_index"
+        with pytest.raises(DocumentStoreError, match="Failed to execute SQL query"):
+            document_store.query_sql(invalid_query)
diff --git a/integrations/opensearch/tests/test_document_store_async.py b/integrations/opensearch/tests/test_document_store_async.py
@@ -6,6 +6,7 @@
 
 import pytest
 from haystack.dataclasses import Document
+from haystack.document_stores.errors import DocumentStoreError
 from haystack.document_stores.types import DuplicatePolicy
 
 from haystack_integrations.document_stores.opensearch.document_store import OpenSearchDocumentStore
@@ -572,3 +573,66 @@ async def test_get_field_unique_values(self, document_store: OpenSearchDocumentS
         )
         assert set(unique_priorities_filtered) == {"1"}
         assert priority_count == 1
+
+    @pytest.mark.asyncio
+    async def test_query_sql(self, document_store: OpenSearchDocumentStore):
+        """
+        Test executing SQL queries with `query_sql_async` in every response format, plus error handling.
+        """
+        import asyncio  # local import, only needed for the non-blocking wait below
+
+        docs = [
+            Document(content="Python programming", meta={"category": "A", "status": "active", "priority": 1}),
+            Document(content="Java programming", meta={"category": "B", "status": "active", "priority": 2}),
+            Document(content="Python scripting", meta={"category": "A", "status": "inactive", "priority": 3}),
+            Document(content="JavaScript development", meta={"category": "C", "status": "active", "priority": 1}),
+        ]
+        await document_store.write_documents_async(docs)
+        await asyncio.sleep(1)  # Wait for documents to be indexed without blocking the event loop
+
+        # Test SQL query with JSON format (default)
+        sql_query = (
+            f"SELECT content, category, status, priority FROM {document_store._index} "  # noqa: S608
+            f"WHERE category = 'A' ORDER BY priority"
+        )
+        result = await document_store.query_sql_async(sql_query, response_format="json")
+
+        # The JSON format returns a list of dictionaries (the _source from each hit); check the type first
+        assert isinstance(result, list)
+        assert len(result) == 2  # Two documents with category A
+        assert all(isinstance(row, dict) for row in result)
+
+        # Verify data contains expected values
+        categories = [row.get("category") for row in result]
+        assert all(cat == "A" for cat in categories)
+
+        # Verify all expected fields are present
+        for row in result:
+            assert "content" in row
+            assert "category" in row
+            assert "status" in row
+            assert "priority" in row
+
+        # Test SQL query with CSV format
+        result_csv = await document_store.query_sql_async(sql_query, response_format="csv")
+        assert isinstance(result_csv, str)
+        assert "content" in result_csv
+        assert "category" in result_csv
+
+        # Test SQL query with JDBC format (can be dict or str depending on OpenSearch version)
+        result_jdbc = await document_store.query_sql_async(sql_query, response_format="jdbc")
+        assert result_jdbc is not None
+
+        # Test SQL query with RAW format
+        result_raw = await document_store.query_sql_async(sql_query, response_format="raw")
+        assert isinstance(result_raw, str)
+
+        # Test COUNT query (may return different format, check it's a valid response)
+        count_query = f"SELECT COUNT(*) as total FROM {document_store._index}"  # noqa: S608
+        count_result = await document_store.query_sql_async(count_query, response_format="json")
+        assert count_result is not None
+
+        # Test error handling for invalid SQL query
+        invalid_query = "SELECT * FROM non_existent_index"
+        with pytest.raises(DocumentStoreError, match="Failed to execute SQL query"):
+            await document_store.query_sql_async(invalid_query)