diff --git a/haystack/components/converters/image/image_utils.py b/haystack/components/converters/image/image_utils.py index 1775ae2aa5..746ceb85a4 100644 --- a/haystack/components/converters/image/image_utils.py +++ b/haystack/components/converters/image/image_utils.py @@ -239,6 +239,7 @@ def _extract_image_sources_info( invalid, the MIME type is not supported, or the page number is missing for a PDF document. """ images_source_info: list[_ImageSourceInfo] = [] + resolved_root: Path | None = Path(root_path).resolve() if root_path else None for doc in documents: file_path = doc.meta.get(file_path_meta_field) if file_path is None: @@ -247,7 +248,21 @@ def _extract_image_sources_info( f" Please ensure that the documents you are trying to convert have this key set." ) - resolved_file_path = Path(root_path, file_path) + # Document metadata is treated as untrusted: it commonly originates from upstream + # converters or user uploads. Reject absolute paths and resolve against root_path so + # ``../`` segments cannot escape the configured directory. + candidate = Path(file_path) + if candidate.is_absolute(): + raise ValueError( + f"Document with ID '{doc.id}' has an absolute file path '{file_path}' in metadata; " + f"only paths relative to root_path are supported." + ) + resolved_file_path = (resolved_root / candidate).resolve() if resolved_root else candidate.resolve() + if resolved_root is not None and resolved_root != resolved_file_path and resolved_root not in resolved_file_path.parents: + raise ValueError( + f"Document with ID '{doc.id}' resolves to '{resolved_file_path}', which is outside the " + f"configured root_path '{resolved_root}'." + ) if not resolved_file_path.is_file(): raise ValueError( f"Document with ID '{doc.id}' has an invalid file path '{resolved_file_path}'. " diff --git a/releasenotes/notes/image-utils-reject-paths-outside-root-3f87c01ad9fbb245.yaml b/releasenotes/notes/image-utils-reject-paths-outside-root-3f87c01ad9fbb245.yaml new file mode 100644 index 0000000000..4d5700a87a --- /dev/null +++ b/releasenotes/notes/image-utils-reject-paths-outside-root-3f87c01ad9fbb245.yaml @@ -0,0 +1,10 @@ +--- +enhancements: + - | + ``_extract_image_sources_info`` (used by image-aware converters and + extractors such as ``LLMDocumentContentExtractor`` and + ``DocumentToImageContent``) now treats the per-document ``file_path`` + metadata field as untrusted. Absolute paths are rejected and relative + paths are resolved and verified to remain inside the configured + ``root_path``; values that escape via ``..`` segments raise a ``ValueError`` + instead of silently reading files outside the directory.