Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ def __init__(
task: str | None = None,
dimensions: int | None = None,
late_chunking: bool | None = None,
*,
base_url: str = JINA_API_URL,
) -> None:
"""
Create a JinaDocumentEmbedder component.
Expand All @@ -72,13 +74,15 @@ def __init__(
:param late_chunking: A boolean to enable or disable late chunking.
Apply the late chunking technique to leverage the model's long-context capabilities for
generating contextual chunk embeddings.
:param base_url: The full URL of the Jina embeddings endpoint that requests are posted to (not just the API root).

The support of `task` and `late_chunking` parameters is only available for jina-embeddings-v3.
"""
resolved_api_key = api_key.resolve_value()

self.api_key = api_key
self.model_name = model
self.base_url = base_url
self.prefix = prefix
self.suffix = suffix
self.batch_size = batch_size
Expand Down Expand Up @@ -113,6 +117,7 @@ def to_dict(self) -> dict[str, Any]:
kwargs = {
"api_key": self.api_key.to_dict(),
"model": self.model_name,
"base_url": self.base_url,
"prefix": self.prefix,
"suffix": self.suffix,
"batch_size": self.batch_size,
Expand Down Expand Up @@ -173,7 +178,7 @@ def _embed_batch(
):
batch = texts_to_embed[i : i + batch_size]
response = self._session.post(
JINA_API_URL,
self.base_url,
json={"input": batch, "model": self.model_name, **(parameters or {})},
).json()
if "data" not in response:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def __init__(
*,
api_key: Secret = Secret.from_env_var("JINA_API_KEY"), # noqa: B008
model: str = "jina-clip-v2",
base_url: str = JINA_API_URL,
Comment thread
srini047 marked this conversation as resolved.
file_path_meta_field: str = "file_path",
root_path: str | None = None,
embedding_dimension: int | None = None,
Expand All @@ -74,6 +75,7 @@ def __init__(
- "jina-clip-v2" (default)
- "jina-embeddings-v4"
Check the list of available models on [Jina documentation](https://jina.ai/embeddings/).
:param base_url: The full URL of the Jina embeddings endpoint that requests are posted to (not just the API root).
:param file_path_meta_field: The metadata field in the Document that contains the file path to the image or PDF.
:param root_path: The root directory path where document files are located. If provided, file paths in
document metadata will be resolved relative to this path. If None, file paths are treated as absolute paths.
Expand All @@ -89,6 +91,7 @@ def __init__(

self.api_key = api_key
self.model_name = model
self.base_url = base_url
self.file_path_meta_field = file_path_meta_field
self.root_path = root_path or ""
self.embedding_dimension = embedding_dimension
Expand Down Expand Up @@ -120,6 +123,7 @@ def to_dict(self) -> dict[str, Any]:
self,
api_key=self.api_key.to_dict(),
model=self.model_name,
base_url=self.base_url,
file_path_meta_field=self.file_path_meta_field,
root_path=self.root_path,
embedding_dimension=self.embedding_dimension,
Expand Down Expand Up @@ -235,7 +239,7 @@ def run(self, documents: list[Document]) -> dict[str, list[Document]]:

try:
response = self._session.post(
JINA_API_URL,
self.base_url,
json={
"input": batch_images,
"model": self.model_name,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ def __init__(
task: str | None = None,
dimensions: int | None = None,
late_chunking: bool | None = None,
*,
base_url: str = JINA_API_URL,
) -> None:
"""
Create a JinaTextEmbedder component.
Expand All @@ -60,6 +62,7 @@ def __init__(
:param late_chunking: A boolean to enable or disable late chunking.
Apply the late chunking technique to leverage the model's long-context capabilities for
generating contextual chunk embeddings.
:param base_url: The full URL of the Jina embeddings endpoint that requests are posted to (not just the API root).

The support of `task` and `late_chunking` parameters is only available for jina-embeddings-v3.
"""
Expand All @@ -68,6 +71,7 @@ def __init__(

self.api_key = api_key
self.model_name = model
self.base_url = base_url
self.prefix = prefix
self.suffix = suffix
self._session = requests.Session()
Expand Down Expand Up @@ -98,6 +102,7 @@ def to_dict(self) -> dict[str, Any]:
kwargs: dict[str, Any] = {
"api_key": self.api_key.to_dict(),
"model": self.model_name,
"base_url": self.base_url,
"prefix": self.prefix,
"suffix": self.suffix,
}
Expand Down Expand Up @@ -152,7 +157,7 @@ def run(self, text: str) -> dict[str, Any]:
parameters["late_chunking"] = self.late_chunking

resp = self._session.post(
JINA_API_URL,
self.base_url,
json={"input": [text_to_embed], "model": self.model_name, **parameters},
).json()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def __init__(
api_key: Secret = Secret.from_env_var("JINA_API_KEY"), # noqa: B008,
top_k: int | None = None,
score_threshold: float | None = None,
*,
base_url: str = JINA_API_URL,
) -> None:
"""
Creates an instance of JinaRanker.
Expand All @@ -47,6 +49,7 @@ def __init__(
The maximum number of Documents to return per query. If `None`, all documents are returned
:param score_threshold:
If provided only returns documents with a score above this threshold.
:param base_url: The full URL of the Jina rerank endpoint that requests are posted to (not just the API root).

:raises ValueError:
If `top_k` is not > 0.
Expand All @@ -57,6 +60,7 @@ def __init__(
self.model = model
self.top_k = top_k
self.score_threshold = score_threshold
self.base_url = base_url

if self.top_k is not None and self.top_k <= 0:
msg = f"top_k must be > 0, but got {top_k}"
Expand All @@ -82,6 +86,7 @@ def to_dict(self) -> dict[str, Any]:
self,
api_key=self.api_key.to_dict(),
model=self.model,
base_url=self.base_url,
top_k=self.top_k,
score_threshold=self.score_threshold,
)
Expand Down Expand Up @@ -149,7 +154,7 @@ def run(
}

resp = self._session.post(
JINA_API_URL,
self.base_url,
json=data,
).json()

Expand Down
6 changes: 6 additions & 0 deletions integrations/jina/tests/test_document_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def test_init_default(self, monkeypatch):

assert embedder.api_key == Secret.from_env_var("JINA_API_KEY")
assert embedder.model_name == "jina-embeddings-v3"
assert embedder.base_url == "https://api.jina.ai/v1/embeddings"
assert embedder.prefix == ""
assert embedder.suffix == ""
assert embedder.batch_size == 32
Expand All @@ -44,6 +45,7 @@ def test_init_with_parameters(self):
embedder = JinaDocumentEmbedder(
api_key=Secret.from_token("fake-api-key"),
model="model",
base_url="https://my.custom.url/v1/embeddings",
prefix="prefix",
suffix="suffix",
batch_size=64,
Expand All @@ -57,6 +59,7 @@ def test_init_with_parameters(self):

assert embedder.api_key == Secret.from_token("fake-api-key")
assert embedder.model_name == "model"
assert embedder.base_url == "https://my.custom.url/v1/embeddings"
assert embedder.prefix == "prefix"
assert embedder.suffix == "suffix"
assert embedder.batch_size == 64
Expand All @@ -81,6 +84,7 @@ def test_to_dict(self, monkeypatch):
"init_parameters": {
"api_key": {"env_vars": ["JINA_API_KEY"], "strict": True, "type": "env_var"},
"model": "jina-embeddings-v3",
"base_url": "https://api.jina.ai/v1/embeddings",
"prefix": "",
"suffix": "",
"batch_size": 32,
Expand All @@ -94,6 +98,7 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch):
monkeypatch.setenv("JINA_API_KEY", "fake-api-key")
component = JinaDocumentEmbedder(
model="model",
base_url="https://my.custom.url/v1/embeddings",
prefix="prefix",
suffix="suffix",
batch_size=64,
Expand All @@ -109,6 +114,7 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch):
"init_parameters": {
"api_key": {"env_vars": ["JINA_API_KEY"], "strict": True, "type": "env_var"},
"model": "model",
"base_url": "https://my.custom.url/v1/embeddings",
"prefix": "prefix",
"suffix": "suffix",
"batch_size": 64,
Expand Down
7 changes: 7 additions & 0 deletions integrations/jina/tests/test_document_image_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def test_init_default(self, monkeypatch):
monkeypatch.setenv("JINA_API_KEY", "fake-api-key")
embedder = JinaDocumentImageEmbedder()
assert embedder.model_name == "jina-clip-v2"
assert embedder.base_url == "https://api.jina.ai/v1/embeddings"
assert embedder.file_path_meta_field == "file_path"
assert embedder.root_path == ""
assert embedder.embedding_dimension is None
Expand All @@ -31,13 +32,15 @@ def test_init_with_parameters(self):
embedder = JinaDocumentImageEmbedder(
api_key=Secret.from_token("fake-api-token"),
model="jina-embeddings-v4",
base_url="https://my.custom.url/v1/embeddings",
file_path_meta_field="custom_file_path",
root_path="/custom/root",
embedding_dimension=256,
image_size=(512, 512),
batch_size=5,
)
assert embedder.model_name == "jina-embeddings-v4"
assert embedder.base_url == "https://my.custom.url/v1/embeddings"
assert embedder.file_path_meta_field == "custom_file_path"
assert embedder.root_path == "/custom/root"
assert embedder.embedding_dimension == 256
Expand All @@ -50,6 +53,7 @@ def test_to_dict(self, monkeypatch):
component = JinaDocumentImageEmbedder(
api_key=Secret.from_env_var("JINA_API_KEY"),
model="jina-clip-v2",
base_url="https://api.jina.ai/v1/embeddings",
file_path_meta_field="image_path",
root_path="/images",
embedding_dimension=512,
Expand All @@ -62,6 +66,7 @@ def test_to_dict(self, monkeypatch):
"init_parameters": {
"api_key": {"env_vars": ["JINA_API_KEY"], "strict": True, "type": "env_var"},
"model": "jina-clip-v2",
"base_url": "https://api.jina.ai/v1/embeddings",
"file_path_meta_field": "image_path",
"root_path": "/images",
"embedding_dimension": 512,
Expand All @@ -78,6 +83,7 @@ def test_from_dict(self, monkeypatch):
"init_parameters": {
"api_key": {"env_vars": ["JINA_API_KEY"], "strict": True, "type": "env_var"},
"model": "jina-clip-v2",
"base_url": "https://api.jina.ai/v1/embeddings",
"file_path_meta_field": "image_path",
"root_path": "/images",
"embedding_dimension": 512,
Expand All @@ -87,6 +93,7 @@ def test_from_dict(self, monkeypatch):
}
component = JinaDocumentImageEmbedder.from_dict(data)
assert component.model_name == "jina-clip-v2"
assert component.base_url == "https://api.jina.ai/v1/embeddings"
assert component.file_path_meta_field == "image_path"
assert component.root_path == "/images"
assert component.embedding_dimension == 512
Expand Down
13 changes: 11 additions & 2 deletions integrations/jina/tests/test_ranker.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,17 @@ def test_init_default(self, monkeypatch):
assert embedder.model == "jina-reranker-v1-base-en"

def test_init_with_parameters(self):
embedder = JinaRanker(api_key=Secret.from_token("fake-api-key"), model="model", top_k=64, score_threshold=0.5)
embedder = JinaRanker(
api_key=Secret.from_token("fake-api-key"),
model="model",
base_url="https://my.custom.url/v1/rerank",
top_k=64,
score_threshold=0.5,
)

assert embedder.api_key == Secret.from_token("fake-api-key")
assert embedder.model == "model"
assert embedder.base_url == "https://my.custom.url/v1/rerank"
assert embedder.top_k == 64
assert embedder.score_threshold == 0.5

Expand All @@ -59,20 +66,22 @@ def test_to_dict(self, monkeypatch):
"init_parameters": {
"api_key": {"env_vars": ["JINA_API_KEY"], "strict": True, "type": "env_var"},
"model": "jina-reranker-v1-base-en",
"base_url": "https://api.jina.ai/v1/rerank",
"top_k": None,
"score_threshold": None,
},
}

def test_to_dict_with_custom_init_parameters(self, monkeypatch):
monkeypatch.setenv("JINA_API_KEY", "fake-api-key")
component = JinaRanker(model="model", top_k=64, score_threshold=0.5)
component = JinaRanker(model="model", top_k=64, score_threshold=0.5, base_url="https://my.custom.url/v1/rerank")
data = component.to_dict()
assert data == {
"type": "haystack_integrations.components.rankers.jina.ranker.JinaRanker",
"init_parameters": {
"api_key": {"env_vars": ["JINA_API_KEY"], "strict": True, "type": "env_var"},
"model": "model",
"base_url": "https://my.custom.url/v1/rerank",
"top_k": 64,
"score_threshold": 0.5,
},
Expand Down
6 changes: 6 additions & 0 deletions integrations/jina/tests/test_text_embedder.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,22 @@ def test_init_default(self, monkeypatch):

assert embedder.api_key == Secret.from_env_var("JINA_API_KEY")
assert embedder.model_name == "jina-embeddings-v3"
assert embedder.base_url == "https://api.jina.ai/v1/embeddings"
assert embedder.prefix == ""
assert embedder.suffix == ""

def test_init_with_parameters(self):
embedder = JinaTextEmbedder(
api_key=Secret.from_token("fake-api-key"),
model="model",
base_url="https://my.custom.url/v1/embeddings",
prefix="prefix",
suffix="suffix",
late_chunking=True,
)
assert embedder.api_key == Secret.from_token("fake-api-key")
assert embedder.model_name == "model"
assert embedder.base_url == "https://my.custom.url/v1/embeddings"
assert embedder.prefix == "prefix"
assert embedder.suffix == "suffix"
assert embedder.late_chunking is True
Expand All @@ -50,6 +53,7 @@ def test_to_dict(self, monkeypatch):
"init_parameters": {
"api_key": {"env_vars": ["JINA_API_KEY"], "strict": True, "type": "env_var"},
"model": "jina-embeddings-v3",
"base_url": "https://api.jina.ai/v1/embeddings",
"prefix": "",
"suffix": "",
},
Expand All @@ -59,6 +63,7 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch):
monkeypatch.setenv("JINA_API_KEY", "fake-api-key")
component = JinaTextEmbedder(
model="model",
base_url="https://my.custom.url/v1/embeddings",
prefix="prefix",
suffix="suffix",
task="retrieval.query",
Expand All @@ -70,6 +75,7 @@ def test_to_dict_with_custom_init_parameters(self, monkeypatch):
"init_parameters": {
"api_key": {"env_vars": ["JINA_API_KEY"], "strict": True, "type": "env_var"},
"model": "model",
"base_url": "https://my.custom.url/v1/embeddings",
"prefix": "prefix",
"suffix": "suffix",
"task": "retrieval.query",
Expand Down