Skip to content

Commit 1d7e82a

Browse files
sjrlanakin87
authored and committed
feat: Add image converters (deepset-ai#9628)
* Add image converters * Fix tests * Update haystack/components/converters/image/__init__.py Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com> --------- Co-authored-by: Stefano Fiorucci <stefanofiorucci@gmail.com>
1 parent 67e0db0 commit 1d7e82a

16 files changed

Lines changed: 1652 additions & 0 deletions
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# API-reference generation config for the image converter components (ReadmeCoreRenderer output).
loaders:
  - type: haystack_pydoc_tools.loaders.CustomPythonLoader
    search_path:
      - ../../../haystack/components/converters/image
    # One entry per converter module to document.
    modules:
      - "document_to_image"
      - "file_to_document"
      - "file_to_image"
      - "pdf_to_image"
    ignore_when_discovered:
      - "__init__"
processors:
  - type: filter
    expression:
    documented_only: true
    do_not_filter_modules: false
    skip_empty_modules: true
  - type: smart
  - type: crossref
renderer:
  type: haystack_pydoc_tools.renderers.ReadmeCoreRenderer
  excerpt: Various converters to transform image data from one format to another.
  category_slug: haystack-api
  title: Image Converters
  slug: image-converters-api
  order: 21
  markdown:
    descriptive_class_title: false
    classdef_code_block: false
    descriptive_module_title: true
    add_method_class_prefix: true
    add_member_class_prefix: false
    filename: image_converters_api.md
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# API-reference generation config for the image converter components (DocusaurusRenderer output).
loaders:
  - type: haystack_pydoc_tools.loaders.CustomPythonLoader
    search_path:
      - ../../../haystack/components/converters/image
    # One entry per converter module to document.
    modules:
      - "document_to_image"
      - "file_to_document"
      - "file_to_image"
      - "pdf_to_image"
    ignore_when_discovered:
      - "__init__"
processors:
  - type: filter
    expression:
    documented_only: true
    do_not_filter_modules: false
    skip_empty_modules: true
  - type: smart
  - type: crossref
renderer:
  type: haystack_pydoc_tools.renderers.DocusaurusRenderer
  description: Various converters to transform image data from one format to another.
  title: Image Converters
  id: image-converters-api
  markdown:
    descriptive_class_title: false
    classdef_code_block: false
    descriptive_module_title: true
    add_method_class_prefix: true
    add_member_class_prefix: false
    filename: image_converters_api.md
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import sys
6+
from typing import TYPE_CHECKING
7+
8+
from lazy_imports import LazyImporter
9+
10+
# Maps each submodule of this package to the public names it exports.
# The table is handed to LazyImporter in the runtime branch below.
_import_structure = {
    "document_to_image": ["DocumentToImageContent"],
    "file_to_document": ["ImageFileToDocument"],
    "file_to_image": ["ImageFileToImageContent"],
    "pdf_to_image": ["PDFToImageContent"],
}

if TYPE_CHECKING:
    # Static type checkers take this branch, so the exported names resolve through ordinary imports.
    from .document_to_image import DocumentToImageContent
    from .file_to_document import ImageFileToDocument
    from .file_to_image import ImageFileToImageContent
    from .pdf_to_image import PDFToImageContent
else:
    # At runtime the module object registered under this package's name is replaced by a LazyImporter
    # built from `_import_structure`.
    # NOTE(review): presumably this defers importing each submodule until one of its names is first
    # accessed - confirm against the lazy_imports documentation.
    sys.modules[__name__] = LazyImporter(name=__name__, module_file=__file__, import_structure=_import_structure)
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
from typing import Dict, List, Literal, Optional, Tuple
6+
7+
from haystack import Document, component, logging
8+
from haystack.components.converters.image.image_utils import (
9+
_batch_convert_pdf_pages_to_images,
10+
_encode_image_to_base64,
11+
_extract_image_sources_info,
12+
_PDFPageInfo,
13+
pillow_import,
14+
pypdfium2_import,
15+
)
16+
from haystack.dataclasses import ByteStream
17+
from haystack.dataclasses.image_content import ImageContent
18+
19+
logger = logging.getLogger(__name__)
20+
21+
22+
@component
class DocumentToImageContent:
    """
    Converts documents sourced from PDF and image files into ImageContents.

    This component processes a list of documents and extracts visual content from supported file formats, converting
    them into ImageContents that can be used for multimodal AI tasks. It handles both direct image files and PDF
    documents by extracting specific pages as images.

    Documents are expected to have metadata containing:
    - The `file_path_meta_field` key with a valid file path that exists when combined with `root_path`
    - A supported image format (MIME type must be one of the supported image types)
    - For PDF files, a `page_number` key specifying which page to extract

    ### Usage example
    ```python
    from haystack import Document
    from haystack.components.converters.image import DocumentToImageContent

    converter = DocumentToImageContent(
        file_path_meta_field="file_path",
        root_path="/data/documents",
        detail="high",
        size=(800, 600)
    )

    documents = [
        Document(content="Optional description of image.jpg", meta={"file_path": "image.jpg"}),
        Document(content="Text content of page 1 of doc.pdf", meta={"file_path": "doc.pdf", "page_number": 1})
    ]

    result = converter.run(documents)
    image_contents = result["image_contents"]
    # [ImageContent(
    #    base64_image='/9j/4A...', mime_type='image/jpeg', detail='high', meta={'file_path': 'image.jpg'}
    #  ),
    #  ImageContent(
    #    base64_image='/9j/4A...', mime_type='image/jpeg', detail='high',
    #    meta={'page_number': 1, 'file_path': 'doc.pdf'}
    #  )]
    ```
    """

    def __init__(
        self,
        *,
        file_path_meta_field: str = "file_path",
        root_path: Optional[str] = None,
        detail: Optional[Literal["auto", "high", "low"]] = None,
        size: Optional[Tuple[int, int]] = None,
    ):
        """
        Initialize the DocumentToImageContent component.

        :param file_path_meta_field: The metadata field in the Document that contains the file path to the image or PDF.
        :param root_path: The root directory path where document files are located. If provided, file paths in
            document metadata will be resolved relative to this path. If None, file paths are treated as absolute paths.
        :param detail: Optional detail level of the image (only supported by OpenAI). Can be "auto", "high", or "low".
            This will be passed to the created ImageContent objects.
        :param size: If provided, resizes the image to fit within the specified dimensions (width, height) while
            maintaining aspect ratio. This reduces file size, memory usage, and processing time, which is beneficial
            when working with models that have resolution constraints or when transmitting images to remote services.
        """
        # Fail early if the optional imaging dependencies are not installed.
        pillow_import.check()
        pypdfium2_import.check()

        self.file_path_meta_field = file_path_meta_field
        # A missing root path is stored as an empty string.
        self.root_path = root_path or ""
        self.detail = detail
        self.size = size

    @component.output_types(image_contents=List[Optional[ImageContent]])
    def run(self, documents: List[Document]) -> Dict[str, List[Optional[ImageContent]]]:
        """
        Convert documents with image or PDF sources into ImageContent objects.

        This method processes the input documents, extracting images from supported file formats and converting them
        into ImageContent objects.

        :param documents: A list of documents to process. Each document should have metadata containing at minimum
            the key named by `file_path_meta_field`. PDF documents additionally require a 'page_number' key to
            specify which page to convert.

        :returns:
            Dictionary containing one key:
            - "image_contents": ImageContents created from the processed documents. These contain base64-encoded image
              data and metadata. The order corresponds to order of input documents. Entries whose conversion
              produced no image are None.
        :raises ValueError:
            If any document is missing the required metadata keys, has an invalid file path, or has an unsupported
            MIME type. The error message will specify which document and what information is missing or incorrect.
        """
        if not documents:
            return {"image_contents": []}

        images_source_info = _extract_image_sources_info(
            documents=documents, file_path_meta_field=self.file_path_meta_field, root_path=self.root_path
        )

        # Pre-sized so results can be written by document index; untouched slots stay None.
        image_contents: List[Optional[ImageContent]] = [None] * len(documents)

        pdf_page_infos: List[_PDFPageInfo] = []

        for doc_idx, image_source_info in enumerate(images_source_info):
            mime_type = image_source_info["mime_type"]
            path = image_source_info["path"]
            if mime_type == "application/pdf":
                # Store PDF documents for later processing
                page_number = image_source_info.get("page_number")
                assert page_number is not None  # checked in _extract_image_sources_info but mypy doesn't know that
                pdf_page_info: _PDFPageInfo = {"doc_idx": doc_idx, "path": path, "page_number": page_number}
                pdf_page_infos.append(pdf_page_info)
            else:
                # Process images directly
                bytestream = ByteStream.from_file_path(filepath=path, mime_type=mime_type)
                _, base64_image = _encode_image_to_base64(bytestream=bytestream, size=self.size)
                image_contents[doc_idx] = ImageContent(
                    base64_image=base64_image,
                    mime_type=mime_type,
                    detail=self.detail,
                    meta={"file_path": documents[doc_idx].meta[self.file_path_meta_field]},
                )

        # efficiently convert PDF pages to images: each PDF is opened and processed only once
        pdf_page_infos_by_doc_idx: Dict[int, _PDFPageInfo] = {
            pdf_page_info["doc_idx"]: pdf_page_info for pdf_page_info in pdf_page_infos
        }
        pdf_images_by_doc_idx = _batch_convert_pdf_pages_to_images(
            pdf_page_infos=pdf_page_infos, size=self.size, return_base64=True
        )
        for doc_idx, base64_pdf_image in pdf_images_by_doc_idx.items():
            meta = {
                "file_path": documents[doc_idx].meta[self.file_path_meta_field],
                "page_number": pdf_page_infos_by_doc_idx[doc_idx]["page_number"],
            }
            # we know that base64_pdf_image is a string because we set return_base64=True but mypy doesn't know that
            assert isinstance(base64_pdf_image, str)
            image_contents[doc_idx] = ImageContent(
                base64_image=base64_pdf_image, mime_type="image/jpeg", detail=self.detail, meta=meta
            )

        # Any slot still None means no image was produced for that document; report the affected IDs.
        none_image_contents_doc_ids = [
            documents[doc_idx].id for doc_idx, image_content in enumerate(image_contents) if image_content is None
        ]
        if none_image_contents_doc_ids:
            # Placeholder + keyword argument instead of an f-string, matching the logging style used by the
            # other converters in this package.
            logger.warning(
                "Conversion failed for some documents. Their output will be None. Document IDs: {document_ids}",
                document_ids=none_image_contents_doc_ids,
            )

        return {"image_contents": image_contents}
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
2+
#
3+
# SPDX-License-Identifier: Apache-2.0
4+
5+
import os
6+
from pathlib import Path
7+
from typing import Any, Dict, List, Optional, Union
8+
9+
from haystack import Document, component, logging
10+
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
11+
from haystack.dataclasses import ByteStream
12+
13+
logger = logging.getLogger(__name__)
14+
15+
16+
@component
class ImageFileToDocument:
    """
    Wraps image file references in content-less Document objects that carry only metadata.

    Use this component when image file paths have to travel through a pipeline as `Document` objects, for example
    to feed downstream components such as the `SentenceTransformersImageDocumentEmbedder`.

    Nothing is extracted from the image files themselves: every produced `Document` has `None` as its content, and
    its metadata holds the file path plus any values supplied by the user.

    ### Usage example
    ```python
    from haystack.components.converters.image import ImageFileToDocument

    converter = ImageFileToDocument()

    sources = ["image.jpg", "another_image.png"]

    result = converter.run(sources=sources)
    documents = result["documents"]

    print(documents)

    # [Document(id=..., meta: {'file_path': 'image.jpg'}),
    # Document(id=..., meta: {'file_path': 'another_image.png'})]
    ```
    """

    def __init__(self, *, store_full_path: bool = False):
        """
        Create the ImageFileToDocument component.

        :param store_full_path:
            When True, the document metadata keeps the complete path of the file.
            When False, only the file name is kept.
        """
        self.store_full_path = store_full_path

    @component.output_types(documents=List[Document])
    def run(
        self,
        *,
        sources: List[Union[str, Path, ByteStream]],
        meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None,
    ) -> Dict[str, List[Document]]:
        """
        Build one content-less Document per readable image source.

        Each source (a file path or a ByteStream) yields a `Document` without content whose metadata combines
        the metadata of the source with the optional user-provided metadata. Sources that cannot be read are
        logged and skipped.

        :param sources:
            List of file paths or ByteStream objects to convert.
        :param meta:
            Optional metadata to attach to the documents.
            Either a single dictionary or a list of dictionaries.
            A single dictionary is added to the metadata of every produced document.
            A list must have the same length as `sources`, because the two are zipped together.
            For ByteStream objects, their `meta` is added to the output documents.

        :returns:
            A dictionary containing:
            - `documents`: A list of `Document` objects with empty content and associated metadata.
        """
        per_source_meta = normalize_metadata(meta, sources_count=len(sources))
        documents = []

        for source, user_meta in zip(sources, per_source_meta):
            try:
                stream = get_bytestream_from_source(source)
            except Exception as e:
                logger.warning("Could not read {source}. Skipping it. Error: {error}", source=source, error=e)
                continue

            # User-provided values take precedence over the metadata carried by the source.
            doc_meta = {**stream.meta, **user_meta}

            if not self.store_full_path:
                source_path = stream.meta.get("file_path")
                if source_path:
                    # Keep only the file name when the full path should not be stored.
                    doc_meta["file_path"] = os.path.basename(source_path)

            documents.append(Document(content=None, meta=doc_meta))

        return {"documents": documents}

0 commit comments

Comments
 (0)