From a6f333a06ea9045875f1c2d7204f04627353a762 Mon Sep 17 00:00:00 2001 From: earayu Date: Mon, 27 Apr 2026 19:31:14 +0800 Subject: [PATCH] feat(celery Wave 6 #39): per-provider multimodal embedding input payload + fix self.base_url bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wave 5 P2 chunk 1 shipped `EmbeddingService._embed_image_via_litellm` with a single LiteLLM-shaped `input=[{"image_url": {"url": "data:..."}}, {"text": "..."}]` payload that mirrors OpenAI's multimodal *chat completion* request. That shape is **not** the canonical embedding input shape for the documented multimodal embedders (Voyage AI, Jina, Cohere) — LiteLLM may translate it for some providers but ships no guarantee. This PR introduces `aperag/llm/embed/multimodal_input.py` — `build_multimodal_input_payload(provider, image_data_url, alt_text)` — that dispatches to provider-specific shapes per their documented embedding wire format: * **Voyage AI** (`voyage_ai` / `voyageai` / `voyage`): `[{"content": [{"type": "image_base64", "image_base64": ...}, {"type": "text", "text": ...}]}]` * **Jina** (`jina_ai` / `jinaai` / `jina`): flat single-key list `[{"image": ...}, {"text": ...}]` — fused embedding * **Cohere** (`cohere`): same flat-list shape as Jina * **OpenAI** (`openai` / `openai_multimodal`): chat-multimodal envelope with `image_url` / `text` parts (closest documented shape; OpenAI text-embedding endpoints don't accept images yet so the failure surfaces as a provider-side 4xx with a clear message) * **Unknown provider**: falls back to the Wave 5 P2 baseline shape so prior behaviour is preserved (hard-cut directive: no shim, but also no regression for unmapped providers) Also fixes a latent Wave 5 P2 chunk 1 bug: `_embed_image_via_litellm` referenced `self.base_url` (undefined) where it should be `self.api_base` (the constructor-set attribute matching the text path). 
The first real production `embed_image()` call would have raised `AttributeError`; this PR makes the production code path actually reachable. Tests ----- `tests/unit_test/llm/test_multimodal_input.py` — 11 tests: * Voyage / Jina / Cohere / OpenAI / unknown-provider shape pinning * alias matching (`voyage` / `voyageai` / `VOYAGE` / ` Voyage `) * empty / whitespace `alt_text` skips the text part on every provider (otherwise pairing the image with " " changes the cache key and may confuse the embedder) * `provider=None` resolves to the default Full unit suite: **1038 passed, 29 skipped**, ruff + format clean. Out of scope (per task #39 boundary) ------------------------------------ * No new operator config — provider keyword already drives the rest of the embedding stack (cache key, text path, error wrap). * No backend rename / migration — task #36 territory. * No cache-layer changes — task #37 already shipped that wiring. --- aperag/llm/embed/embedding_service.py | 31 ++-- aperag/llm/embed/multimodal_input.py | 158 +++++++++++++++++++ tests/unit_test/llm/test_multimodal_input.py | 124 +++++++++++++++ 3 files changed, 299 insertions(+), 14 deletions(-) create mode 100644 aperag/llm/embed/multimodal_input.py create mode 100644 tests/unit_test/llm/test_multimodal_input.py diff --git a/aperag/llm/embed/embedding_service.py b/aperag/llm/embed/embedding_service.py index 2aca4a31f..394a6f7fd 100644 --- a/aperag/llm/embed/embedding_service.py +++ b/aperag/llm/embed/embedding_service.py @@ -18,7 +18,7 @@ import hashlib import logging from concurrent.futures import ThreadPoolExecutor, as_completed -from typing import Any, Dict, List, Sequence, Tuple +from typing import Dict, List, Sequence, Tuple import litellm @@ -226,19 +226,22 @@ async def aembed_image(self, image_bytes: bytes, alt_text: str = "") -> List[flo def _embed_image_via_litellm(self, *, image_bytes: bytes, alt_text: str) -> List[float]: """Underlying LiteLLM multimodal embedding call. 
- Encodes the image as base64 data URL + builds the LiteLLM - ``input`` payload. Provider-specific input shape variations are - Wave 6 follow-up (per §K.10 Wave 6 backlog cross-cutting - refactor). Currently uses the documented LiteLLM-shaped - ``[{"image_url": {"url": "data:..."}}]`` input that - multimodal-capable providers (Voyage / Jina v3 / OpenAI multi- - modal / etc.) accept natively. + Encodes the image as a base64 data URL and dispatches to a + provider-specific input payload shape via + :func:`build_multimodal_input_payload` (Wave 6 task #39 per + §G.2.5.1). Voyage / Jina / Cohere / OpenAI all document + different multimodal embedding wire shapes; the dispatcher + emits the canonical shape for the configured provider and + falls back to the Wave 5 P2 LiteLLM-documented default for + unknown providers. """ import base64 import imghdr from litellm import embedding as litellm_embedding + from aperag.llm.embed.multimodal_input import build_multimodal_input_payload + # Detect MIME type from the image bytes header (avoids relying # on caller-provided alt_text format hints). Falls back to # image/jpeg if detection fails — most providers tolerate @@ -248,17 +251,17 @@ def _embed_image_via_litellm(self, *, image_bytes: bytes, alt_text: str) -> List b64 = base64.b64encode(image_bytes).decode("ascii") data_url = f"data:{mime};base64,{b64}" - input_payload: List[dict[str, Any]] = [{"image_url": {"url": data_url}}] - if alt_text and alt_text.strip(): - # Pair the image with text for embedders that accept multi- - # part inputs; embedders that ignore text simply drop it. 
- input_payload.append({"text": alt_text.strip()}) + input_payload = build_multimodal_input_payload( + provider=self.embedding_provider, + image_data_url=data_url, + alt_text=alt_text, + ) response = litellm_embedding( model=f"{self.embedding_provider}/{self.model}" if self.embedding_provider else self.model, input=input_payload, api_key=self.api_key, - api_base=self.base_url, + api_base=self.api_base, ) # LiteLLM normalises response shape to OpenAI-style; pull the # embedding from the first (and only) data element. diff --git a/aperag/llm/embed/multimodal_input.py b/aperag/llm/embed/multimodal_input.py new file mode 100644 index 000000000..fa5e33a0d --- /dev/null +++ b/aperag/llm/embed/multimodal_input.py @@ -0,0 +1,158 @@ +# Copyright 2025 ApeCloud, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Provider-specific multimodal embedding input payload builders. + +Wave 6 task #39 per `docs/modularization/indexing-redesign-design-pack.md` +§G.2.5.1: the Wave 5 P2 chunk 1 :func:`EmbeddingService._embed_image_via_litellm` +shipped a single LiteLLM-style ``input=[{"image_url": {"url": "data:..."}}, {"text":...}]`` +shape that mirrors the OpenAI multimodal *chat completion* request. +That shape is **not** the canonical *embedding* request shape for the +real multimodal-capable providers (Voyage AI, Jina, Cohere, etc.) — +LiteLLM may translate it transparently for some providers but ships +no guarantee. 
This module dispatches a per-provider payload so the +operator-configured embedder receives the input shape it actually +documents. + +The dispatcher matches the ``embedding_provider`` string used by +:class:`EmbeddingService` (LiteLLM provider keyword) and is purposely +**hard-cut** — there is no shim or fall-back to a "compatibility" +shape; unknown providers fall through to the LiteLLM-documented +default that the rest of the embedding stack already uses. +""" + +from __future__ import annotations + +from typing import Any + +# Canonical provider keywords (matches LiteLLM ``custom_llm_provider`` / +# the provider prefix on ``model="<provider>/<model>"`` calls). The +# accepted-aliases tuple lets operators name the provider in any +# common way (LiteLLM itself accepts ``"voyage_ai"`` and ``"voyage"``). +_VOYAGE_ALIASES = ("voyage_ai", "voyageai", "voyage") +_JINA_ALIASES = ("jina_ai", "jinaai", "jina") +_OPENAI_MULTIMODAL_ALIASES = ("openai_multimodal", "openai") +_COHERE_ALIASES = ("cohere",) + + +def build_multimodal_input_payload( + *, + provider: str | None, + image_data_url: str, + alt_text: str, +) -> list[dict[str, Any]]: + """Return the ``input=`` payload for ``litellm.embedding(...)``. + + ``image_data_url`` must already be a base64 data URL + (``data:image/<subtype>;base64,<...>``) — the caller is responsible + for MIME detection. ``alt_text`` may be empty; providers that + accept paired text+image inputs use it, others ignore it. + + Returns a ``list[dict]`` because every supported provider's + embedding wire shape is ``input: [...]`` even for a single image. 
+ """ + + p = (provider or "").strip().lower() + if p in _VOYAGE_ALIASES: + return _voyage_payload(image_data_url, alt_text) + if p in _JINA_ALIASES: + return _jina_payload(image_data_url, alt_text) + if p in _COHERE_ALIASES: + return _cohere_payload(image_data_url, alt_text) + if p in _OPENAI_MULTIMODAL_ALIASES: + return _openai_payload(image_data_url, alt_text) + return _default_payload(image_data_url, alt_text) + + +def _voyage_payload(image_data_url: str, alt_text: str) -> list[dict[str, Any]]: + """Voyage AI ``voyage-multimodal-3`` input shape. + + Voyage's multimodal embedding API expects each input to be a + ``{"content": [...]}`` envelope listing one or more parts; image + parts use ``{"type": "image_base64", "image_base64": "<data-url>"}`` + and text parts use ``{"type": "text", "text": "..."}``. The text + part is omitted when ``alt_text`` is empty or whitespace-only. + """ + + parts: list[dict[str, Any]] = [{"type": "image_base64", "image_base64": image_data_url}] + if alt_text and alt_text.strip(): + parts.append({"type": "text", "text": alt_text.strip()}) + return [{"content": parts}] + + +def _jina_payload(image_data_url: str, alt_text: str) -> list[dict[str, Any]]: + """Jina (``jina-clip-v2`` / ``jina-embeddings-v4``) input shape. + + Jina's multimodal embedding endpoint accepts a list of single-key + dicts: ``{"image": "<data-url>"}`` for images and + ``{"text": "..."}`` for text. They are embedded jointly so paired + image + alt-text returns a single fused vector. + """ + + items: list[dict[str, Any]] = [{"image": image_data_url}] + if alt_text and alt_text.strip(): + items.append({"text": alt_text.strip()}) + return items + + +def _cohere_payload(image_data_url: str, alt_text: str) -> list[dict[str, Any]]: + """Cohere multimodal embedding (``embed-*-v3`` with image input). + + Cohere's image embedding uses ``{"image": "<data-url>"}`` per item; + text is sent as a separate ``{"text": "..."}`` item in the same + list. 
Cohere does not return a fused vector for paired + text+image, so we keep both items independent — the caller can + choose which vector to consume. + """ + + items: list[dict[str, Any]] = [{"image": image_data_url}] + if alt_text and alt_text.strip(): + items.append({"text": alt_text.strip()}) + return items + + +def _openai_payload(image_data_url: str, alt_text: str) -> list[dict[str, Any]]: + """OpenAI multimodal embedding input shape (LiteLLM-mapped). + + OpenAI's standard ``text-embedding-3-*`` models do not accept + image input — operators that flip ``Model.supports_multimodal_embedding`` + on an OpenAI text embedder will hit a runtime error from the + provider. This builder formats the same multipart envelope used + by the OpenAI multimodal *chat* request (the closest documented + shape) so the failure surfaces as a provider-side 4xx with a + clear message instead of a silently-truncated text-only embed. + """ + + parts: list[dict[str, Any]] = [{"type": "image_url", "image_url": {"url": image_data_url}}] + if alt_text and alt_text.strip(): + parts.append({"type": "text", "text": alt_text.strip()}) + return parts + + +def _default_payload(image_data_url: str, alt_text: str) -> list[dict[str, Any]]: + """Fallback to the Wave 5 P2 LiteLLM-documented default shape. + + Used when ``embedding_provider`` is unset or matches no canonical + alias above. Mirrors the Wave 5 P2 chunk 1 baseline so an + unmapped provider keeps the prior behaviour rather than raising — + operators see the same error path they would have seen pre-#39. 
+ """ + + payload: list[dict[str, Any]] = [{"image_url": {"url": image_data_url}}] + if alt_text and alt_text.strip(): + payload.append({"text": alt_text.strip()}) + return payload + + +__all__ = ["build_multimodal_input_payload"] diff --git a/tests/unit_test/llm/test_multimodal_input.py b/tests/unit_test/llm/test_multimodal_input.py new file mode 100644 index 000000000..e4e065bf5 --- /dev/null +++ b/tests/unit_test/llm/test_multimodal_input.py @@ -0,0 +1,124 @@ +# Copyright 2025 ApeCloud, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for Wave 6 task #39: provider-specific multimodal input +payload dispatcher (`build_multimodal_input_payload`). +""" + +from __future__ import annotations + +import pytest + +from aperag.llm.embed.multimodal_input import build_multimodal_input_payload + +_DATA_URL = "data:image/jpeg;base64,AAAA" +_ALT = "two cats on the sofa" + + +@pytest.mark.parametrize("provider", ["voyage_ai", "voyageai", "voyage", "VOYAGE", " Voyage "]) +def test_voyage_payload_uses_content_envelope_with_image_base64_part(provider): + """Voyage AI multimodal embedding wraps each input in a + ``{"content": [parts]}`` envelope — image part must use + ``"image_base64"`` discriminator + carry the data URL inline. 
+ """ + + payload = build_multimodal_input_payload(provider=provider, image_data_url=_DATA_URL, alt_text=_ALT) + + assert isinstance(payload, list) and len(payload) == 1 + item = payload[0] + assert "content" in item + parts = item["content"] + image_parts = [p for p in parts if p.get("type") == "image_base64"] + text_parts = [p for p in parts if p.get("type") == "text"] + assert len(image_parts) == 1 and image_parts[0]["image_base64"] == _DATA_URL + assert len(text_parts) == 1 and text_parts[0]["text"] == _ALT + + +def test_voyage_payload_omits_text_part_when_alt_text_empty(): + payload = build_multimodal_input_payload(provider="voyage_ai", image_data_url=_DATA_URL, alt_text="") + parts = payload[0]["content"] + assert all(p.get("type") != "text" for p in parts), "empty alt_text must not produce a text part" + + +@pytest.mark.parametrize("provider", ["jina_ai", "jinaai", "jina"]) +def test_jina_payload_uses_flat_image_and_text_items(provider): + """Jina (clip-v2 / embeddings-v4) accepts a flat list of single-key + dicts: ``{"image": ...}`` for images, ``{"text": ...}`` for text. + """ + + payload = build_multimodal_input_payload(provider=provider, image_data_url=_DATA_URL, alt_text=_ALT) + + assert payload == [{"image": _DATA_URL}, {"text": _ALT}] + + +def test_jina_payload_omits_text_when_alt_text_empty(): + payload = build_multimodal_input_payload(provider="jina_ai", image_data_url=_DATA_URL, alt_text=" ") + assert payload == [{"image": _DATA_URL}] + + +def test_cohere_payload_uses_image_then_text_items(): + payload = build_multimodal_input_payload(provider="cohere", image_data_url=_DATA_URL, alt_text=_ALT) + assert payload == [{"image": _DATA_URL}, {"text": _ALT}] + + +def test_openai_payload_uses_chat_multimodal_envelope(): + """OpenAI's documented multimodal request shape uses + ``{"type": "image_url", "image_url": {"url": ...}}`` parts. 
+ """ + + payload = build_multimodal_input_payload(provider="openai", image_data_url=_DATA_URL, alt_text=_ALT) + + assert payload == [ + {"type": "image_url", "image_url": {"url": _DATA_URL}}, + {"type": "text", "text": _ALT}, + ] + + +def test_openai_payload_alias_openai_multimodal_resolves_same_shape(): + a = build_multimodal_input_payload(provider="openai_multimodal", image_data_url=_DATA_URL, alt_text="") + b = build_multimodal_input_payload(provider="openai", image_data_url=_DATA_URL, alt_text="") + assert a == b + + +def test_unknown_provider_falls_back_to_litellm_default_shape(): + """Unmapped providers preserve the Wave 5 P2 LiteLLM-documented + default shape so the prior behaviour is unchanged. + """ + + payload = build_multimodal_input_payload(provider="some-new-provider", image_data_url=_DATA_URL, alt_text=_ALT) + assert payload == [ + {"image_url": {"url": _DATA_URL}}, + {"text": _ALT}, + ] + + +def test_none_provider_resolves_to_default(): + payload = build_multimodal_input_payload(provider=None, image_data_url=_DATA_URL, alt_text="x") + assert payload[0] == {"image_url": {"url": _DATA_URL}} + + +def test_alt_text_whitespace_treated_as_empty_across_providers(): + """A whitespace-only ``alt_text`` must not produce a text part on + any provider — pairing the image with " " would change the cache + key + may confuse the embedder. + """ + + for provider in ("voyage_ai", "jina_ai", "cohere", "openai", "unknown"): + payload = build_multimodal_input_payload(provider=provider, image_data_url=_DATA_URL, alt_text=" ") + flat = payload[0].get("content", payload) + text_present = any( + ("text" in p and p["text"]) or (p.get("type") == "text") + for p in (flat if isinstance(flat, list) else [flat]) + ) + assert not text_present, f"{provider}: whitespace alt_text must not produce a text part"