Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions backend/app/schemas/kind.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ class EmbeddingConfig(BaseModel):
encoding_format: Optional[str] = Field(
"float", description="Encoding format (float, base64)"
)
additional_input_modalities: Optional[List[str]] = Field(
None,
description="Additional supported input modalities beyond the implicit "
"text default (e.g., ['image']). Do not include 'text' in this list.",
)


class RerankConfig(BaseModel):
Expand Down
35 changes: 35 additions & 0 deletions backend/app/services/rag/embedding/capabilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# SPDX-FileCopyrightText: 2026 Weibo, Inc.
#
# SPDX-License-Identifier: Apache-2.0

"""Capability helpers for embedding model inputs."""

from __future__ import annotations

from typing import Iterable

# Modalities an embedding model may declare on top of the implicit text input.
SUPPORTED_ADDITIONAL_INPUT_MODALITIES = {"image"}


def normalize_additional_input_modalities(
    modalities: Iterable[str] | None,
) -> list[str]:
    """Normalize additional input modalities with text-only fallback.

    Each entry is stripped and lower-cased; entries that are not strings,
    are empty, or are not listed in ``SUPPORTED_ADDITIONAL_INPUT_MODALITIES``
    are dropped. Duplicates are removed while keeping first-seen order.

    Args:
        modalities: Declared modalities, a single modality given as a bare
            string, or ``None``/empty when nothing was declared.

    Returns:
        The supported modalities in first-seen order. An empty list means
        the model is text-only.
    """
    if not modalities:
        return []

    # A bare string is itself an iterable of characters; without this guard
    # a config value of "image" would be walked letter by letter and
    # silently normalised to [] instead of ["image"].
    if isinstance(modalities, str):
        modalities = [modalities]

    cleaned = (
        entry.strip().lower() for entry in modalities if isinstance(entry, str)
    )
    # dict.fromkeys de-duplicates while preserving insertion order. The empty
    # string is never a supported modality, so blank entries drop out here.
    return list(
        dict.fromkeys(
            value
            for value in cleaned
            if value in SUPPORTED_ADDITIONAL_INPUT_MODALITIES
        )
    )


def embedding_supports_image_input(modalities: Iterable[str] | None) -> bool:
    """Tell whether ``"image"`` survives normalization of *modalities*.

    Args:
        modalities: Declared additional input modalities, or ``None``.

    Returns:
        ``True`` only when the normalized modality list contains ``"image"``.
    """
    declared = normalize_additional_input_modalities(modalities)
    return "image" in declared
17 changes: 17 additions & 0 deletions backend/app/services/rag/embedding/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
from app.services.chat.config.model_resolver import (
build_default_headers_with_placeholders,
)
from app.services.rag.embedding.capabilities import (
embedding_supports_image_input,
normalize_additional_input_modalities,
)
from app.services.rag.embedding.custom import CustomEmbedding
from shared.utils.crypto import decrypt_api_key

Expand Down Expand Up @@ -171,10 +175,23 @@ def create_embedding_model_from_crd(
# This is used by Milvus to create collections with the correct dimension
embedding_config = spec.get("embeddingConfig", {})
dimensions = embedding_config.get("dimensions") if embedding_config else None
additional_input_modalities = normalize_additional_input_modalities(
embedding_config.get("additional_input_modalities")
if embedding_config
else None
)
if dimensions:
logger.info(
f"[EmbeddingFactory] Model '{model_name}' has configured dimensions: {dimensions}"
)
if additional_input_modalities:
logger.info(
"[EmbeddingFactory] Model '%s' additional input modalities: %s "
"(supports_image_input=%s)",
model_name,
additional_input_modalities,
embedding_supports_image_input(additional_input_modalities),
)

# Build embedding config based on protocol
if protocol == "openai":
Expand Down
26 changes: 26 additions & 0 deletions backend/app/services/rag/index/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from llama_index.core import Document, SimpleDirectoryReader

from app.schemas.rag import SplitterConfig
from app.services.rag.preprocess import sanitize_text_for_indexing
from app.services.rag.splitter import SemanticSplitter, SentenceSplitter, SmartSplitter
from app.services.rag.splitter.factory import create_splitter
from app.services.rag.storage.base import BaseStorageBackend
Expand Down Expand Up @@ -68,6 +69,29 @@ def sanitize_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
return sanitized


def sanitize_documents(documents: List[Document]) -> List[Document]:
    """Sanitize document text before chunking.

    Each document's text is passed through ``sanitize_text_for_indexing``.
    Documents in which nothing was replaced are returned as-is; documents
    with at least one replacement are rebuilt from their dumped payload with
    the sanitized text, and a span event records the replacement counts.

    Args:
        documents: Documents about to be split into nodes.

    Returns:
        A new list, in the same order as *documents*, holding the original
        object for untouched documents and a rebuilt ``Document`` for
        sanitized ones.
    """
    sanitized_documents: List[Document] = []
    for doc in documents:
        result = sanitize_text_for_indexing(doc.text)
        if result.replacements_count <= 0:
            # Nothing was replaced, so the text is unchanged: keep the
            # original object rather than paying for (and risking loss in)
            # a model_dump()/Document(**payload) round-trip.
            sanitized_documents.append(doc)
            continue

        add_span_event(
            "rag.indexer.documents.sanitized",
            {
                "replacements_count": str(result.replacements_count),
                "replacement_summary": str(result.replacement_summary),
            },
        )

        payload = doc.model_dump()
        payload["text"] = result.text
        # When the dump also carries a text_resource entry, keep its text in
        # sync with the sanitized text so both views agree.
        if payload.get("text_resource"):
            payload["text_resource"]["text"] = result.text
        sanitized_documents.append(Document(**payload))

    return sanitized_documents


class DocumentIndexer:
"""Orchestrates document indexing process."""

Expand Down Expand Up @@ -215,6 +239,8 @@ def _index_documents(
for doc in documents:
doc.metadata = sanitize_metadata(doc.metadata)

documents = sanitize_documents(documents)

# Split documents into nodes
nodes = self.splitter.split_documents(documents)

Expand Down
9 changes: 9 additions & 0 deletions backend/app/services/rag/preprocess/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# SPDX-FileCopyrightText: 2026 Weibo, Inc.
#
# SPDX-License-Identifier: Apache-2.0

"""Text preprocessing utilities for RAG indexing."""

from .text_sanitizer import SanitizedTextResult, sanitize_text_for_indexing

# Names re-exported from .text_sanitizer as this package's public API.
__all__ = ["SanitizedTextResult", "sanitize_text_for_indexing"]
79 changes: 79 additions & 0 deletions backend/app/services/rag/preprocess/text_sanitizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# SPDX-FileCopyrightText: 2026 Weibo, Inc.
#
# SPDX-License-Identifier: Apache-2.0

"""Sanitize inline binary payloads before text chunking and embedding."""

from __future__ import annotations

import re
from collections import Counter
from dataclasses import dataclass

# Markdown image whose target is a base64 image data URL:
#   ![alt](data:image/<subtype...>;base64,<payload>)
# Whitespace is allowed just inside the parentheses. Named groups: "alt" is
# the text between the square brackets, "url" is the whole data URL.
# Matching is case-insensitive.
MARKDOWN_IMAGE_DATA_URL_PATTERN = re.compile(
    r"!\[(?P<alt>[^\]]*)\]\(\s*(?P<url>data:image/[^)\s]+;base64,[A-Za-z0-9+/=]+)\s*\)",
    re.IGNORECASE,
)
# Any base64 data URL: data:<mime>[;param...];base64,<payload>
# Named groups: "mime" is the media type, "data" is the base64 payload.
# Matching is case-insensitive.
DATA_URL_PATTERN = re.compile(
    r"data:(?P<mime>[-\w.+/]+)(?:;[-\w=]+)*;base64,(?P<data>[A-Za-z0-9+/=]+)",
    re.IGNORECASE,
)
# A run of at least 128 characters drawn from the base64 alphabet
# (A-Z, a-z, 0-9, "+", "/", "="). The lookbehind/lookahead require that the
# run is not directly adjacent to another character from that alphabet.
BARE_BASE64_PATTERN = re.compile(
    r"(?<![A-Za-z0-9+/=])(?:[A-Za-z0-9+/=]{128,})(?![A-Za-z0-9+/=])"
)


@dataclass(frozen=True)
class SanitizedTextResult:
    """Sanitized text plus replacement accounting.

    Frozen dataclass returned by ``sanitize_text_for_indexing``; its fields
    cannot be reassigned after construction.
    """

    # Text after inline binary payloads were replaced with placeholders.
    text: str
    # Total number of replacements made across all categories.
    replacements_count: int
    # Replacement count per category key (e.g. "inline_image", "inline_pdf",
    # "embedded_binary", "bare_base64"); empty when nothing was replaced.
    replacement_summary: dict[str, int]


def sanitize_text_for_indexing(text: str) -> SanitizedTextResult:
    """Swap inline binary payloads in *text* for short readable placeholders.

    Three substitution passes run in a fixed order: markdown images that
    embed a base64 image data URL, then any remaining base64 data URL, then
    long bare base64 runs. Every substitution is tallied by category.

    Args:
        text: Raw document text; falsy input is returned as an empty result.

    Returns:
        A ``SanitizedTextResult`` with the rewritten text, the total number
        of substitutions, and the per-category counts.
    """
    if not text:
        return SanitizedTextResult(
            text=text or "",
            replacements_count=0,
            replacement_summary={},
        )

    tally: Counter[str] = Counter()

    def _markdown_image(found: re.Match[str]) -> str:
        # Keep the alt text (when present) so the image's caption survives.
        tally["inline_image"] += 1
        alt_text = found.group("alt").strip()
        if not alt_text:
            return "[inline image omitted]"
        return f"![{alt_text}]([inline image omitted])"

    def _data_url(found: re.Match[str]) -> str:
        # Pick the category and placeholder from the declared media type.
        mime = found.group("mime").lower()
        if mime.startswith("image/"):
            category, placeholder = "inline_image", "[inline image omitted]"
        elif mime == "application/pdf":
            category, placeholder = "inline_pdf", "[inline pdf omitted]"
        else:
            category, placeholder = (
                "embedded_binary",
                "[embedded binary content omitted]",
            )
        tally[category] += 1
        return placeholder

    def _bare_base64(_found: re.Match[str]) -> str:
        tally["bare_base64"] += 1
        return "[base64 content omitted]"

    # Order matters: the markdown form must be consumed before the generic
    # data-URL pass, and bare base64 runs are handled last.
    passes = (
        (MARKDOWN_IMAGE_DATA_URL_PATTERN, _markdown_image),
        (DATA_URL_PATTERN, _data_url),
        (BARE_BASE64_PATTERN, _bare_base64),
    )
    cleaned = text
    for pattern, replacer in passes:
        cleaned = pattern.sub(replacer, cleaned)

    return SanitizedTextResult(
        text=cleaned,
        replacements_count=sum(tally.values()),
        replacement_summary=dict(tally),
    )
52 changes: 52 additions & 0 deletions backend/tests/services/rag/test_embedding_capabilities.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# SPDX-FileCopyrightText: 2026 Weibo, Inc.
#
# SPDX-License-Identifier: Apache-2.0

import pytest

from app.schemas.kind import EmbeddingConfig
from app.services.rag.embedding.capabilities import (
embedding_supports_image_input,
normalize_additional_input_modalities,
)


@pytest.mark.unit
class TestEmbeddingCapabilities:
    """Checks how EmbeddingConfig modalities feed the capability helpers."""

    def test_embedding_config_without_modalities_defaults_to_text_only(self) -> None:
        """An omitted modality list normalizes to text-only."""
        declared = EmbeddingConfig(dimensions=1536).additional_input_modalities

        assert normalize_additional_input_modalities(declared) == []
        assert embedding_supports_image_input(declared) is False

    def test_embedding_config_with_empty_modalities_stays_text_only(self) -> None:
        """An explicitly empty modality list normalizes to text-only."""
        declared = EmbeddingConfig(
            dimensions=1536,
            additional_input_modalities=[],
        ).additional_input_modalities

        assert normalize_additional_input_modalities(declared) == []
        assert embedding_supports_image_input(declared) is False

    def test_embedding_config_with_image_modality_is_detected(self) -> None:
        """Declaring 'image' is preserved and reported as image support."""
        declared = EmbeddingConfig(
            dimensions=1536,
            additional_input_modalities=["image"],
        ).additional_input_modalities

        assert normalize_additional_input_modalities(declared) == ["image"]
        assert embedding_supports_image_input(declared) is True
Loading
Loading