Skip to content

Commit a02cc9c

Browse files
committed
Address review comments
1 parent 445de96 commit a02cc9c

4 files changed

Lines changed: 89 additions & 35 deletions

File tree

.github/workflows/libreoffice.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,18 +31,18 @@ jobs:
3131
fail-fast: false
3232
matrix:
3333
os: [ubuntu-latest, windows-latest, macos-latest]
34-
python-version: ["3.10", "3.13"]
34+
python-version: ["3.10", "3.14"]
3535

3636
steps:
3737
- name: Support longpaths
3838
if: matrix.os == 'windows-latest'
3939
working-directory: .
4040
run: git config --system core.longpaths true
4141

42-
- uses: actions/checkout@v6
42+
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
4343

4444
- name: Set up Python ${{ matrix.python-version }}
45-
uses: actions/setup-python@v6
45+
uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
4646
with:
4747
python-version: ${{ matrix.python-version }}
4848

@@ -86,6 +86,6 @@ jobs:
8686
if: failure() && github.event_name == 'schedule'
8787
runs-on: ubuntu-slim
8888
steps:
89-
- uses: deepset-ai/notify-slack-action@v1
89+
- uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1
9090
with:
9191
slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }}

integrations/libreoffice/pydoc/config_docusaurus.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ processors:
77
documented_only: true
88
skip_empty_modules: true
99
renderer:
10-
description: Haystack 2.x component to convert files using LibreOffice
10+
description: LibreOffice integration for Haystack
1111
id: integrations-libreoffice
1212
filename: libreoffice.md
13-
title: LibreOffice File Converter
13+
title: LibreOffice

integrations/libreoffice/pyproject.toml

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@ build-backend = "hatchling.build"
55
[project]
66
name = "libreoffice-haystack"
77
dynamic = ["version"]
8-
description = "Haystack 2.x component to convert files using LibreOffice."
8+
description = "LibreOffice integration for Haystack"
99
readme = "README.md"
1010
requires-python = ">=3.10"
1111
license = "Apache-2.0"
1212
keywords = []
13-
authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }, {name = "Max Swain"}]
13+
authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }, { name = "Max Swain" }]
1414
classifiers = [
1515
"License :: OSI Approved :: Apache Software License",
1616
"Development Status :: 4 - Beta",
@@ -19,6 +19,7 @@ classifiers = [
1919
"Programming Language :: Python :: 3.11",
2020
"Programming Language :: Python :: 3.12",
2121
"Programming Language :: Python :: 3.13",
22+
"Programming Language :: Python :: 3.14",
2223
"Programming Language :: Python :: Implementation :: CPython",
2324
"Programming Language :: Python :: Implementation :: PyPy",
2425
]
@@ -82,6 +83,13 @@ select = [
8283
"ARG",
8384
"B",
8485
"C",
86+
"D102", # Missing docstring in public method
87+
"D103", # Missing docstring in public function
88+
"D205", # 1 blank line required between summary line and description
89+
"D209", # Closing triple quotes go to new line
90+
"D213", # Multi-line docstring summary should start at the second line
91+
"D417", # Missing argument descriptions in the docstring
92+
"D419", # Docstring is empty
8593
"DTZ",
8694
"E",
8795
"EM",
@@ -130,7 +138,7 @@ ban-relative-imports = "parents"
130138

131139
[tool.ruff.lint.per-file-ignores]
132140
# Tests can use magic values, assertions, relative imports, and don't need docstrings or type annotations
133-
"tests/**/*" = ["PLR2004", "S101", "TID252", "ANN"]
141+
"tests/**/*" = ["D", "PLR2004", "S101", "TID252", "ANN"]
134142

135143
[tool.coverage.run]
136144
source = ["haystack_integrations"]

integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py

Lines changed: 72 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -86,9 +86,38 @@ class LibreOfficeFileConverter:
8686
"ppt": frozenset(["pdf", "pptx", "odp", "html", "png", "jpg"]),
8787
"odp": frozenset(["pdf", "pptx", "ppt", "html", "png", "jpg"]),
8888
}
89+
"""A non-exhaustive mapping of the conversion types supported by this component.
90+
See https://help.libreoffice.org/latest/en-GB/text/shared/guide/convertfilters.html for more information."""
8991

90-
def __init__(self) -> None:
91-
"""Check whether soffice is installed."""
92+
def __init__(
93+
self,
94+
output_file_type: Literal[
95+
"doc",
96+
"docx",
97+
"odt",
98+
"rtf",
99+
"txt",
100+
"html",
101+
"xlsx",
102+
"xls",
103+
"ods",
104+
"csv",
105+
"pptx",
106+
"ppt",
107+
"odp",
108+
"epub",
109+
"png",
110+
"jpg",
111+
]
112+
| None = None,
113+
) -> None:
114+
"""
115+
Initialize the converter and check whether soffice is installed.
116+
117+
:param output_file_type:
118+
Target file format to convert to. Must be a valid conversion target for
119+
each source's input type — see :attr:`SUPPORTED_TYPES` for the full mapping.
120+
"""
92121
soffice_path = shutil.which("soffice")
93122
if soffice_path is None:
94123
msg = """LibreOffice (soffice) is required but not installed or not in PATH.
@@ -97,6 +126,7 @@ def __init__(self) -> None:
97126
raise FileNotFoundError(msg)
98127

99128
self.soffice_path = soffice_path
129+
self.output_file_type = output_file_type
100130

101131
def to_dict(self) -> dict[str, Any]:
102132
"""
@@ -127,12 +157,12 @@ def _get_conversion_args(
127157
128158
:param source: Source file path.
129159
:param output_directory: Output directory to save converted files to.
130-
:param output_file_type: Target file format extension (e.g. ``"pdf"``).
131-
:returns: Tuple of ``(output_path, soffice_args)`` where ``output_path`` is the
132-
expected path of the converted file and ``soffice_args`` is the list of
133-
arguments to pass to ``soffice``.
134-
:raises FileNotFoundError: If ``source`` does not exist.
135-
:raises OSError: If ``output_directory`` does not exist or is not writable.
160+
:param output_file_type: Target file format extension (e.g. `"pdf"`).
161+
:returns: Tuple of `(output_path, soffice_args)` where `output_path` is the
162+
expected path of the converted file and `soffice_args` is the list of
163+
arguments to pass to `soffice`.
164+
:raises FileNotFoundError: If `source` does not exist.
165+
:raises OSError: If `output_directory` does not exist or is not writable.
136166
"""
137167
source_path = Path(source)
138168
output_path = Path(output_directory)
@@ -164,12 +194,12 @@ def _validate_args(self, output_file_type: str, input_file_type: str | None = No
164194
165195
:param output_file_type: Target file format extension to convert to.
166196
:param input_file_type: Source file format extension. If provided, validates that
167-
it is a supported input type and that ``output_file_type`` is a valid conversion
197+
it is a supported input type and that `output_file_type` is a valid conversion
168198
target for it.
169-
:raises ValueError: If ``input_file_type`` is not in :attr:`SUPPORTED_TYPES`, or if
170-
``output_file_type`` is not a valid conversion target for the given ``input_file_type``.
199+
:raises ValueError: If `input_file_type` is not in :attr:`SUPPORTED_TYPES`, or if
200+
`output_file_type` is not a valid conversion target for the given `input_file_type`.
171201
"""
172-
# Cannot validate conversion types if input conversions is not known - i.e., source is ``ByteStream``
202+
# Cannot validate the conversion if the input file type is not known - i.e., the source is a `ByteStream`
173203
if input_file_type is None:
174204
return
175205

@@ -206,28 +236,36 @@ def run(
206236
"epub",
207237
"png",
208238
"jpg",
209-
],
239+
]
240+
| None = None,
210241
) -> LibreOfficeFileConverterOutput:
211242
"""
212243
Convert office files to the specified output format using LibreOffice.
213244
214245
:param sources:
215-
List of sources to convert. Each source can be a file path (``str`` or
216-
``Path``) or a ``ByteStream``. For ``ByteStream`` sources, the input file
217-
type cannot be inferred from the filename, so only ``output_file_type`` is
246+
List of sources to convert. Each source can be a file path (`str` or
247+
`Path`) or a `ByteStream`. For `ByteStream` sources, the input file
248+
type cannot be inferred from the filename, so only `output_file_type` is
218249
validated (not the source type).
219250
:param output_file_type:
220251
Target file format to convert to. Must be a valid conversion target for
221252
each source's input type — see :attr:`SUPPORTED_TYPES` for the full mapping.
253+
If set, it will override the `output_file_type` parameter provided during initialization.
222254
:returns:
223255
A dictionary with the following key:
224-
- ``output``: List of ``ByteStream`` objects containing the converted file
225-
data, in the same order as ``sources``.
256+
- `output`: List of `ByteStream` objects containing the converted file
257+
data, in the same order as `sources`.
226258
:raises FileNotFoundError: If a source file path does not exist.
227259
:raises OSError: If the internal temporary output directory is not writable.
228260
:raises ValueError: If a source's file type is not in :attr:`SUPPORTED_TYPES`,
229-
or if ``output_file_type`` is not a valid conversion target for it.
261+
or if `output_file_type` is not a valid conversion target for it,
262+
or if `output_file_type` has not been provided anywhere.
230263
"""
264+
if output_file_type is None and self.output_file_type is None:
265+
msg = "output_file_type must be provided either during initialization or for this method"
266+
raise ValueError(msg)
267+
output_file_type = output_file_type or self.output_file_type
268+
231269
outputs: list[ByteStream] = []
232270
with TemporaryDirectory() as tmpdir:
233271
for source in sources:
@@ -272,30 +310,38 @@ async def run_async(
272310
"epub",
273311
"png",
274312
"jpg",
275-
],
313+
]
314+
| None = None,
276315
) -> LibreOfficeFileConverterOutput:
277316
"""
278317
Asynchronously convert office files to the specified output format using LibreOffice.
279318
280319
This is the asynchronous version of the `run` method with the same parameters and return values.
281320
282321
:param sources:
283-
List of sources to convert. Each source can be a file path (``str`` or
284-
``Path``) or a ``ByteStream``. For ``ByteStream`` sources, the input file
285-
type cannot be inferred from the filename, so only ``output_file_type`` is
322+
List of sources to convert. Each source can be a file path (`str` or
323+
`Path`) or a `ByteStream`. For `ByteStream` sources, the input file
324+
type cannot be inferred from the filename, so only `output_file_type` is
286325
validated (not the source type).
287326
:param output_file_type:
288327
Target file format to convert to. Must be a valid conversion target for
289328
each source's input type — see :attr:`SUPPORTED_TYPES` for the full mapping.
329+
If set, it will override the `output_file_type` parameter provided during initialization.
290330
:returns:
291331
A dictionary with the following key:
292-
- ``output``: List of ``ByteStream`` objects containing the converted file
293-
data, in the same order as ``sources``.
332+
- `output`: List of `ByteStream` objects containing the converted file
333+
data, in the same order as `sources`.
294334
:raises FileNotFoundError: If a source file path does not exist.
295335
:raises OSError: If the internal temporary output directory is not writable.
296336
:raises ValueError: If a source's file type is not in :attr:`SUPPORTED_TYPES`,
297-
or if ``output_file_type`` is not a valid conversion target for it.
337+
or if `output_file_type` is not a valid conversion target for it,
338+
or if `output_file_type` has not been provided anywhere.
298339
"""
340+
if output_file_type is None and self.output_file_type is None:
341+
msg = "output_file_type must be provided either during initialization or for this method"
342+
raise ValueError(msg)
343+
output_file_type = output_file_type or self.output_file_type
344+
299345
outputs: list[ByteStream] = []
300346
with TemporaryDirectory() as tmpdir:
301347
for source in sources:

0 commit comments

Comments
 (0)