deepset-ai
diff --git a/integrations/jina/pyproject.toml b/integrations/jina/pyproject.toml
Lines changed: 1 addition & 1 deletion
diff --git a/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py b/integrations/jina/src/haystack_integrations/components/connectors/jina/reader.py
Lines changed: 48 additions & 18 deletions
diff --git a/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py b/integrations/jina/src/haystack_integrations/components/embedders/jina/document_embedder.py
Lines changed: 115 additions & 53 deletions
@@ -26,7 +26,7 @@ classifiers = [
   "Programming Language :: Python :: Implementation :: CPython",
   "Programming Language :: Python :: Implementation :: PyPy",
 ]
-dependencies = ["requests>=2.25.0", "haystack-ai>=2.22.0"]
+dependencies = ["httpx>=0.28.0", "haystack-ai>=2.22.0"]
 
 [project.urls]
 Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/jina#readme"
 
@@ -2,11 +2,10 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-import json
 from typing import Any
 from urllib.parse import quote
 
-import requests
+import httpx
 from haystack import Document, component, default_from_dict, default_to_dict
 from haystack.utils import Secret, deserialize_secrets_inplace
 
@@ -105,19 +104,7 @@ def _json_to_document(self, data: dict) -> Document:
         document = Document(content=content, meta=data)
         return document
 
-    @component.output_types(documents=list[Document])
-    def run(self, query: str, headers: dict[str, str] | None = None) -> dict[str, list[Document]]:
-        """
-        Process the query/URL using the Jina AI reader service.
-
-        :param query: The query string or URL to process.
-        :param headers: Optional headers to include in the request for customization. Refer to the
-            [Jina Reader documentation](https://jina.ai/reader/) for more information.
-
-        :returns:
-            A dictionary with the following keys:
-                - `documents`: A list of `Document` objects.
-        """
+    def _prepare_request(self, query: str, headers: dict[str, str] | None = None) -> tuple[str, dict[str, str]]:
         headers = headers or {}
         headers["Authorization"] = f"Bearer {self.api_key.resolve_value()}"
 
@@ -127,17 +114,60 @@ def run(self, query: str, headers: dict[str, str] | None = None) -> dict[str, li
         endpoint_url = READER_ENDPOINT_URL_BY_MODE[self.mode]
         encoded_target = quote(query, safe="")
         url = f"{endpoint_url}{encoded_target}"
+        return url, headers
 
-        response = requests.get(url, headers=headers, timeout=60)
-
+    def _parse_response(self, response: httpx.Response, query: str) -> dict[str, list[Document]]:
+        """
+        Convert the HTTP response of the reader endpoint into Documents.
+
+        When `json_response` is disabled, the response text is wrapped in a single Document whose meta
+        holds the response `Content-Type` header and the query. Otherwise the `data` field of the JSON
+        body is converted with `_json_to_document`: one Document per record in search mode, a single
+        Document in every other mode.
+
+        :param response: The response received from the reader endpoint.
+        :param query: The query string or URL that was requested; stored in the meta of raw responses.
+
+        :returns:
+            A dictionary with the following keys:
+                - `documents`: A list of `Document` objects.
+        """
         # raw response: we just return a single Document with text
         if not self.json_response:
             meta = {"content_type": response.headers["Content-Type"], "query": query}
             return {"documents": [Document(content=response.text, meta=meta)]}
 
-        response_json = json.loads(response.content).get("data", {})
+        response_json = response.json().get("data", {})
         if self.mode == JinaReaderMode.SEARCH:
             documents = [self._json_to_document(record) for record in response_json]
             return {"documents": documents}
 
         return {"documents": [self._json_to_document(response_json)]}
+
+    @component.output_types(documents=list[Document])
+    def run(self, query: str, headers: dict[str, str] | None = None) -> dict[str, list[Document]]:
+        """
+        Process the query/URL using the Jina AI reader service.
+
+        :param query: The query string or URL to process.
+        :param headers: Optional headers to include in the request for customization. Refer to the
+            [Jina Reader documentation](https://jina.ai/reader/) for more information.
+
+        :returns:
+            A dictionary with the following keys:
+                - `documents`: A list of `Document` objects.
+        """
+        url, request_headers = self._prepare_request(query, headers)
+
+        # Unlike requests, httpx does not follow redirects unless asked to. Enable it so the
+        # request behaves like the previous `requests.get` call.
+        with httpx.Client(follow_redirects=True) as client:
+            response = client.get(url, headers=request_headers, timeout=60)
+
+        return self._parse_response(response, query)
+
+    @component.output_types(documents=list[Document])
+    async def run_async(self, query: str, headers: dict[str, str] | None = None) -> dict[str, list[Document]]:
+        """
+        Asynchronously process the query/URL using the Jina AI reader service.
+
+        This is the asynchronous version of the `run` method. It has the same parameters and return values
+        but can be used with `await` in async code.
+
+        :param query: The query string or URL to process.
+        :param headers: Optional headers to include in the request for customization. Refer to the
+            [Jina Reader documentation](https://jina.ai/reader/) for more information.
+
+        :returns:
+            A dictionary with the following keys:
+                - `documents`: A list of `Document` objects.
+        """
+        url, request_headers = self._prepare_request(query, headers)
+
+        # Unlike requests, httpx does not follow redirects unless asked to. Enable it so the
+        # request behaves like the previous requests-based implementation.
+        async with httpx.AsyncClient(follow_redirects=True) as client:
+            response = await client.get(url, headers=request_headers, timeout=60)
+
+        return self._parse_response(response, query)
@@ -4,7 +4,7 @@
 from dataclasses import replace
 from typing import Any
 
-import requests
+import httpx
 from haystack import Document, component, default_from_dict, default_to_dict
 from haystack.utils import Secret, deserialize_secrets_inplace
 from tqdm import tqdm
@@ -89,14 +89,11 @@ def __init__(
         self.progress_bar = progress_bar
         self.meta_fields_to_embed = meta_fields_to_embed or []
         self.embedding_separator = embedding_separator
-        self._session = requests.Session()
-        self._session.headers.update(
-            {
-                "Authorization": f"Bearer {resolved_api_key}",
-                "Accept-Encoding": "identity",
-                "Content-type": "application/json",
-            }
-        )
+        self._headers = {
+            "Authorization": f"Bearer {resolved_api_key}",
+            "Accept-Encoding": "identity",
+            "Content-type": "application/json",
+        }
         self.task = task
         self.dimensions = dimensions
         self.late_chunking = late_chunking
@@ -164,40 +161,96 @@ def _prepare_texts_to_embed(self, documents: list[Document]) -> list[str]:
             texts_to_embed.append(text_to_embed)
         return texts_to_embed
 
+    def _validate_input(self, documents: list[Document]) -> None:
+        """
+        Check that the input is a list of Documents.
+
+        Only the first element of a non-empty list is inspected.
+
+        :param documents: The value to validate.
+        :raises TypeError: If `documents` is not a list, or if its first element is not a `Document`.
+        """
+        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
+            # The two literals are concatenated, so the first one must end with a space.
+            msg = (
+                "JinaDocumentEmbedder expects a list of Documents as input. "
+                "In case you want to embed a string, please use the JinaTextEmbedder."
+            )
+            raise TypeError(msg)
+
+    def _prepare_parameters(self) -> dict[str, Any]:
+        """
+        Collect the optional request parameters that have been configured.
+
+        :returns: A dictionary that may contain `task`, `dimensions` and `late_chunking`; each key is
+            present only when the corresponding attribute is not `None`.
+        """
+        candidates = {
+            "task": self.task,
+            "dimensions": self.dimensions,
+            "late_chunking": self.late_chunking,
+        }
+        return {name: value for name, value in candidates.items() if value is not None}
+
+    @staticmethod
+    def _process_batch_response(
+        response: dict[str, Any], all_embeddings: list[list[float]], metadata: dict[str, Any]
+    ) -> None:
+        """
+        Fold the response of one batch request into the running results.
+
+        Both `all_embeddings` and `metadata` are updated in place.
+
+        :param response: The decoded JSON body of one embedding request.
+        :param all_embeddings: Accumulator that receives the embeddings of this batch, sorted by `index`.
+        :param metadata: Accumulator for the `model` name and the summed `usage` counters.
+        :raises RuntimeError: If the response does not contain a `data` field.
+        """
+        if "data" not in response:
+            # Fall back to the whole payload when `detail` is missing, so an unexpected error
+            # shape still surfaces as a RuntimeError rather than a KeyError.
+            raise RuntimeError(response.get("detail", response))
+
+        # Sort resulting embeddings by index
+        sorted_embeddings = sorted(response["data"], key=lambda e: e["index"])
+        all_embeddings.extend(result["embedding"] for result in sorted_embeddings)
+        if "model" not in metadata:
+            metadata["model"] = response["model"]
+        if "usage" not in metadata:
+            # Copy, so that accumulating later batches does not modify the response's own dict.
+            metadata["usage"] = dict(response["usage"])
+        else:
+            metadata["usage"]["prompt_tokens"] += response["usage"]["prompt_tokens"]
+            metadata["usage"]["total_tokens"] += response["usage"]["total_tokens"]
+
     def _embed_batch(
         self, texts_to_embed: list[str], batch_size: int, parameters: dict | None = None
     ) -> tuple[list[list[float]], dict[str, Any]]:
-        """
-        Embed a list of texts in batches.
-        """
+        """
+        Embed a list of texts in batches.
+
+        :param texts_to_embed: The texts to send to the embedding endpoint.
+        :param batch_size: The number of texts sent with each request.
+        :param parameters: Optional extra fields merged into the payload of every request.
+        :returns: A tuple with the embeddings collected over all batches and the metadata dictionary
+            filled in by `_process_batch_response`.
+        """
+        all_embeddings: list[list[float]] = []
+        metadata: dict[str, Any] = {}
+        # httpx applies a 5-second default timeout, whereas the previous requests session had none.
+        # Embedding a full batch can take longer, so use the same 60 seconds as the reader connector.
+        with httpx.Client(timeout=60) as client:
+            for i in tqdm(
+                range(0, len(texts_to_embed), batch_size),
+                disable=not self.progress_bar,
+                desc="Calculating embeddings",
+            ):
+                batch = texts_to_embed[i : i + batch_size]
+                response = client.post(
+                    self.base_url,
+                    json={"input": batch, "model": self.model_name, **(parameters or {})},
+                    headers=self._headers,
+                ).json()
+                self._process_batch_response(response, all_embeddings, metadata)
 
-        all_embeddings = []
-        metadata = {}
-        for i in tqdm(
-            range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings"
-        ):
-            batch = texts_to_embed[i : i + batch_size]
-            response = self._session.post(
-                self.base_url,
-                json={"input": batch, "model": self.model_name, **(parameters or {})},
-            ).json()
-            if "data" not in response:
-                raise RuntimeError(response["detail"])
-
-            # Sort resulting embeddings by index
-            sorted_embeddings = sorted(response["data"], key=lambda e: e["index"])
-            embeddings = [result["embedding"] for result in sorted_embeddings]
-            all_embeddings.extend(embeddings)
-            if "model" not in metadata:
-                metadata["model"] = response["model"]
-            if "usage" not in metadata:
-                metadata["usage"] = dict(response["usage"].items())
-            else:
-                metadata["usage"]["prompt_tokens"] += response["usage"]["prompt_tokens"]
-                metadata["usage"]["total_tokens"] += response["usage"]["total_tokens"]
+        return all_embeddings, metadata
+
+    async def _embed_batch_async(
+        self, texts_to_embed: list[str], batch_size: int, parameters: dict | None = None
+    ) -> tuple[list[list[float]], dict[str, Any]]:
+        """
+        Asynchronously embed a list of texts in batches.
+
+        Batches are requested one after another, each request being awaited before the next is sent.
+
+        :param texts_to_embed: The texts to send to the embedding endpoint.
+        :param batch_size: The number of texts sent with each request.
+        :param parameters: Optional extra fields merged into the payload of every request.
+        :returns: A tuple with the embeddings collected over all batches and the metadata dictionary
+            filled in by `_process_batch_response`.
+        """
+        all_embeddings: list[list[float]] = []
+        metadata: dict[str, Any] = {}
+        # httpx applies a 5-second default timeout, whereas the previous requests session had none.
+        # Embedding a full batch can take longer, so use the same 60 seconds as the reader connector.
+        async with httpx.AsyncClient(timeout=60) as client:
+            for i in tqdm(
+                range(0, len(texts_to_embed), batch_size),
+                disable=not self.progress_bar,
+                desc="Calculating embeddings",
+            ):
+                batch = texts_to_embed[i : i + batch_size]
+                response = await client.post(
+                    self.base_url,
+                    json={"input": batch, "model": self.model_name, **(parameters or {})},
+                    headers=self._headers,
+                )
+                self._process_batch_response(response.json(), all_embeddings, metadata)
 
         return all_embeddings, metadata
 
+    @staticmethod
+    def _build_result(
+        documents: list[Document], embeddings: list[list[float]], metadata: dict[str, Any]
+    ) -> dict[str, Any]:
+        """
+        Pair every Document with its embedding and assemble the component output.
+
+        The result holds copies of the Documents created with `replace`, each carrying its embedding.
+
+        :param documents: The Documents that were embedded.
+        :param embeddings: One embedding per Document, in the same order.
+        :param metadata: The metadata to return under the `meta` key.
+        :returns: A dictionary with the keys `documents` and `meta`.
+        :raises ValueError: If `documents` and `embeddings` differ in length.
+        """
+        embedded_documents = [
+            replace(document, embedding=vector) for document, vector in zip(documents, embeddings, strict=True)
+        ]
+        return {"documents": embedded_documents, "meta": metadata}
+
     @component.output_types(documents=list[Document], meta=dict[str, Any])
     def run(self, documents: list[Document]) -> dict[str, Any]:
         """
@@ -209,27 +262,36 @@ def run(self, documents: list[Document]) -> dict[str, Any]:
             - `meta`: A dictionary with metadata including the model name and usage statistics.
         :raises TypeError: If the input is not a list of Documents.
         """
-        if not isinstance(documents, list) or (documents and not isinstance(documents[0], Document)):
-            msg = (
-                "JinaDocumentEmbedder expects a list of Documents as input."
-                "In case you want to embed a string, please use the JinaTextEmbedder."
-            )
-            raise TypeError(msg)
+        self._validate_input(documents)
 
         texts_to_embed = self._prepare_texts_to_embed(documents=documents)
-        parameters: dict[str, Any] = {}
-        if self.task is not None:
-            parameters["task"] = self.task
-        if self.dimensions is not None:
-            parameters["dimensions"] = self.dimensions
-        if self.late_chunking is not None:
-            parameters["late_chunking"] = self.late_chunking
+        parameters = self._prepare_parameters()
         embeddings, metadata = self._embed_batch(
             texts_to_embed=texts_to_embed, batch_size=self.batch_size, parameters=parameters
         )
 
-        new_documents: list[Document] = []
-        for doc, emb in zip(documents, embeddings, strict=True):
-            new_documents.append(replace(doc, embedding=emb))
+        return self._build_result(documents, embeddings, metadata)
 
-        return {"documents": new_documents, "meta": metadata}
+    @component.output_types(documents=list[Document], meta=dict[str, Any])
+    async def run_async(self, documents: list[Document]) -> dict[str, Any]:
+        """
+        Asynchronously compute the embeddings for a list of Documents.
+
+        Awaitable counterpart of `run`: it accepts the same input and produces the same output.
+
+        :param documents: A list of Documents to embed.
+        :returns: A dictionary with following keys:
+            - `documents`: List of Documents, each with an `embedding` field containing the computed embedding.
+            - `meta`: A dictionary with metadata including the model name and usage statistics.
+        :raises TypeError: If the input is not a list of Documents.
+        """
+        self._validate_input(documents)
+
+        embeddings, metadata = await self._embed_batch_async(
+            texts_to_embed=self._prepare_texts_to_embed(documents=documents),
+            batch_size=self.batch_size,
+            parameters=self._prepare_parameters(),
+        )
+        return self._build_result(documents, embeddings, metadata)
Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@ classifiers = [`
`26`	`26`	`"Programming Language :: Python :: Implementation :: CPython",`
`27`	`27`	`"Programming Language :: Python :: Implementation :: PyPy",`
`28`	`28`	`]`
`29`		`-dependencies = ["requests>=2.25.0", "haystack-ai>=2.22.0"]`
	`29`	`+dependencies = ["httpx>=0.28.0", "haystack-ai>=2.22.0"]`
`30`	`30`
`31`	`31`	`[project.urls]`
`32`	`32`	`Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/jina#readme"`