Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion haystack/components/converters/image/image_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,7 @@ def _extract_image_sources_info(
invalid, the MIME type is not supported, or the page number is missing for a PDF document.
"""
images_source_info: list[_ImageSourceInfo] = []
resolved_root: Path | None = Path(root_path).resolve() if root_path else None
for doc in documents:
file_path = doc.meta.get(file_path_meta_field)
if file_path is None:
Expand All @@ -247,7 +248,21 @@ def _extract_image_sources_info(
f" Please ensure that the documents you are trying to convert have this key set."
)

resolved_file_path = Path(root_path, file_path)
# Document metadata is treated as untrusted: it commonly originates from upstream
# converters or user uploads. Reject absolute paths and resolve against root_path so
# ``../`` segments cannot escape the configured directory.
candidate = Path(file_path)
if candidate.is_absolute():
raise ValueError(
f"Document with ID '{doc.id}' has an absolute file path '{file_path}' in metadata; "
f"only paths relative to root_path are supported."
)
resolved_file_path = (resolved_root / candidate).resolve() if resolved_root else candidate.resolve()
if resolved_root is not None and resolved_root != resolved_file_path and resolved_root not in resolved_file_path.parents:
raise ValueError(
f"Document with ID '{doc.id}' resolves to '{resolved_file_path}', which is outside the "
f"configured root_path '{resolved_root}'."
)
if not resolved_file_path.is_file():
raise ValueError(
f"Document with ID '{doc.id}' has an invalid file path '{resolved_file_path}'. "
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
enhancements:
- |
``_extract_image_sources_info`` (used by image-aware converters and
extractors such as ``LLMDocumentContentExtractor`` and
``DocumentToImageContent``) now treats the per-document ``file_path``
metadata field as untrusted. Absolute paths are always rejected with a
``ValueError``. When ``root_path`` is set, relative paths are resolved
against it and verified to remain inside it; values that escape via ``..``
segments raise a ``ValueError`` instead of silently reading files outside
the directory. When ``root_path`` is not set, relative paths are resolved
against the current working directory and no containment check is applied.