Skip to content

Commit 880057f

Browse files
feat: add mime_type to ByteStream in LibreOfficeFileConverter (#3057)
* feat: add mime_type to ByteStream in LibreOfficeFileConverter
* fix: address copilot docstring and test comments
* check returned mime_type in integration tests

---------

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
1 parent 239e09b commit 880057f

2 files changed

Lines changed: 68 additions & 4 deletions

File tree

integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py

Lines changed: 51 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
import mimetypes
56
import os
67
import shutil
78
import subprocess
@@ -109,6 +110,26 @@ class LibreOfficeFileConverter:
109110
"""A non-exhaustive mapping of supported conversion types by this component.
110111
See https://help.libreoffice.org/latest/en-GB/text/shared/guide/convertfilters.html for more information."""
111112

113+
MIME_TYPE_FALLBACKS: ClassVar[dict[str, str]] = {
114+
"pdf": "application/pdf",
115+
"doc": "application/msword",
116+
"docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
117+
"odt": "application/vnd.oasis.opendocument.text",
118+
"rtf": "application/rtf",
119+
"txt": "text/plain",
120+
"html": "text/html",
121+
"xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
122+
"xls": "application/vnd.ms-excel",
123+
"ods": "application/vnd.oasis.opendocument.spreadsheet",
124+
"csv": "text/csv",
125+
"pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
126+
"ppt": "application/vnd.ms-powerpoint",
127+
"odp": "application/vnd.oasis.opendocument.presentation",
128+
"epub": "application/epub+zip",
129+
"png": "image/png",
130+
"jpg": "image/jpeg",
131+
}
132+
112133
def __init__(
113134
self,
114135
output_file_type: OUTPUT_FILE_TYPE | None = None,
@@ -224,6 +245,12 @@ def _validate_args(self, output_file_type: str, input_file_type: str | None = No
224245
)
225246
raise ValueError(msg)
226247

248+
def _resolve_mime_type(self, output_path: Path, output_file_type: str) -> str | None:
249+
mime_type, _ = mimetypes.guess_type(str(output_path))
250+
if mime_type is None:
251+
return self.MIME_TYPE_FALLBACKS.get(output_file_type)
252+
return mime_type
253+
227254
@component.output_types(output=list[ByteStream])
228255
def run(
229256
self,
@@ -269,14 +296,24 @@ def run(
269296
output_path, args = self._get_conversion_args(tmp_path, tmpdir, resolved_output_file_type)
270297

271298
subprocess.run(args, check=True) # noqa: S603 - ruff doesn't know the arguments have been validated
272-
outputs.append(ByteStream(data=output_path.read_bytes()))
299+
outputs.append(
300+
ByteStream(
301+
data=output_path.read_bytes(),
302+
mime_type=self._resolve_mime_type(output_path, resolved_output_file_type),
303+
)
304+
)
273305
continue
274306

275307
self._validate_args(resolved_output_file_type, str(source).split(".")[-1])
276308
output_path, args = self._get_conversion_args(source, tmpdir, resolved_output_file_type)
277309

278310
subprocess.run(args, check=True) # noqa: S603
279-
outputs.append(ByteStream(data=output_path.read_bytes()))
311+
outputs.append(
312+
ByteStream(
313+
data=output_path.read_bytes(),
314+
mime_type=self._resolve_mime_type(output_path, resolved_output_file_type),
315+
)
316+
)
280317

281318
return {"output": outputs}
282319

@@ -329,7 +366,12 @@ async def run_async(
329366
process = await create_subprocess_exec(*args)
330367
# Wait for process to complete as only one instance of soffice can occur at once
331368
await process.wait()
332-
outputs.append(ByteStream(data=output_path.read_bytes()))
369+
outputs.append(
370+
ByteStream(
371+
data=output_path.read_bytes(),
372+
mime_type=self._resolve_mime_type(output_path, resolved_output_file_type),
373+
)
374+
)
333375
continue
334376

335377
self._validate_args(resolved_output_file_type, str(source).split(".")[-1])
@@ -339,6 +381,11 @@ async def run_async(
339381
# Wait for process to complete as only one instance of soffice can occur at once
340382
await process.wait()
341383

342-
outputs.append(ByteStream(data=output_path.read_bytes()))
384+
outputs.append(
385+
ByteStream(
386+
data=output_path.read_bytes(),
387+
mime_type=self._resolve_mime_type(output_path, resolved_output_file_type),
388+
)
389+
)
343390

344391
return {"output": outputs}

integrations/libreoffice/tests/test_converter.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,7 @@ def test_run(self, converter: LibreOfficeFileConverter, test_files_path: Path) -
8383
for stream in output:
8484
assert isinstance(stream, ByteStream)
8585
assert len(stream.data) > 0
86+
assert stream.mime_type == "application/pdf"
8687

8788
@pytest.mark.integration
8889
def test_run_bytestream_source(self, converter: LibreOfficeFileConverter, test_files_path: Path) -> None:
@@ -95,6 +96,7 @@ def test_run_bytestream_source(self, converter: LibreOfficeFileConverter, test_f
9596
assert len(output) == 1
9697
assert isinstance(output[0], ByteStream)
9798
assert len(output[0].data) > 0
99+
assert output[0].mime_type == "application/pdf"
98100

99101
@pytest.mark.asyncio
100102
@pytest.mark.integration
@@ -112,6 +114,7 @@ async def test_run_async(self, converter: LibreOfficeFileConverter, test_files_p
112114
for stream in output:
113115
assert isinstance(stream, ByteStream)
114116
assert len(stream.data) > 0
117+
assert stream.mime_type == "application/pdf"
115118

116119
@pytest.mark.asyncio
117120
@pytest.mark.integration
@@ -127,3 +130,17 @@ async def test_run_async_bytestream_source(
127130
assert len(output) == 1
128131
assert isinstance(output[0], ByteStream)
129132
assert len(output[0].data) > 0
133+
assert output[0].mime_type == "application/pdf"
134+
135+
def test_resolve_mime_type(self, mock_converter: LibreOfficeFileConverter) -> None:
    """
    Check the three outcomes of `_resolve_mime_type` with `mimetypes.guess_type` patched.

    Covered cases: the guessed type is returned as-is; a `None` guess falls back to the
    table entry for the output file type; a `None` guess for an unlisted type gives `None`.
    """
    # (patched guess_type return value, file name, output file type, expected MIME type)
    scenarios = [
        (("application/pdf", None), "test.pdf", "pdf", "application/pdf"),
        (
            (None, None),
            "test.docx",
            "docx",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        ),
        ((None, None), "test.unknown", "unknown", None),
    ]

    for guess_result, file_name, file_type, expected in scenarios:
        with patch("mimetypes.guess_type", return_value=guess_result):
            resolved = mock_converter._resolve_mime_type(Path(file_name), file_type)

        if expected is None:
            assert resolved is None
        else:
            assert resolved == expected

0 commit comments

Comments
 (0)