From 445de96f7a58b9dde20b848aac2dea5f231dff02 Mon Sep 17 00:00:00 2001 From: Max Swain <89113255+maxdswain@users.noreply.github.com> Date: Fri, 20 Mar 2026 17:34:54 +0000 Subject: [PATCH 01/10] feat: Add new LibreOfficeFileConverter integration --- .github/labeler.yml | 5 + .github/workflows/libreoffice.yml | 91 +++++ README.md | 1 + integrations/libreoffice/LICENSE.txt | 201 +++++++++++ integrations/libreoffice/README.md | 12 + .../libreoffice/pydoc/config_docusaurus.yml | 13 + integrations/libreoffice/pyproject.toml | 151 ++++++++ .../converters/libreoffice/__init__.py | 7 + .../converters/libreoffice/converter.py | 325 ++++++++++++++++++ .../components/converters/py.typed | 0 integrations/libreoffice/tests/__init__.py | 3 + .../libreoffice/tests/test_converter.py | 129 +++++++ .../tests/test_files/doc/sample_doc.doc | Bin 0 -> 12800 bytes .../tests/test_files/ppt/sample_ppt.ppt | Bin 0 -> 499200 bytes .../xls/basic_tables_two_sheets.xls | Bin 0 -> 46080 bytes 15 files changed, 938 insertions(+) create mode 100644 .github/workflows/libreoffice.yml create mode 100644 integrations/libreoffice/LICENSE.txt create mode 100644 integrations/libreoffice/README.md create mode 100644 integrations/libreoffice/pydoc/config_docusaurus.yml create mode 100644 integrations/libreoffice/pyproject.toml create mode 100644 integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/__init__.py create mode 100644 integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py create mode 100644 integrations/libreoffice/src/haystack_integrations/components/converters/py.typed create mode 100644 integrations/libreoffice/tests/__init__.py create mode 100644 integrations/libreoffice/tests/test_converter.py create mode 100644 integrations/libreoffice/tests/test_files/doc/sample_doc.doc create mode 100644 integrations/libreoffice/tests/test_files/ppt/sample_ppt.ppt create mode 100644 
integrations/libreoffice/tests/test_files/xls/basic_tables_two_sheets.xls diff --git a/.github/labeler.yml b/.github/labeler.yml index 0929acebe7..59a97a2840 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -118,6 +118,11 @@ integration:lara: - any-glob-to-any-file: "integrations/lara/**/*" - any-glob-to-any-file: ".github/workflows/lara.yml" +integration:libreoffice: + - changed-files: + - any-glob-to-any-file: "integrations/libreoffice/**/*" + - any-glob-to-any-file: ".github/workflows/libreoffice.yml" + integration:llama_cpp: - changed-files: - any-glob-to-any-file: "integrations/llama_cpp/**/*" diff --git a/.github/workflows/libreoffice.yml b/.github/workflows/libreoffice.yml new file mode 100644 index 0000000000..869d00ea79 --- /dev/null +++ b/.github/workflows/libreoffice.yml @@ -0,0 +1,91 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / libreoffice + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/libreoffice/**" + - "!integrations/libreoffice/*.md" + - ".github/workflows/libreoffice.yml" + +defaults: + run: + working-directory: integrations/libreoffice + +concurrency: + group: libreoffice-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ["3.10", "3.13"] + + steps: + - name: Support longpaths + if: matrix.os == 'windows-latest' + working-directory: . 
+ run: git config --system core.longpaths true + + - uses: actions/checkout@v6 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install --upgrade hatch + - name: Lint + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: hatch run fmt-check && hatch run test:types + + - name: Install LibreOffice headless (Ubuntu) + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install -y libreoffice-common libreoffice-writer libreoffice-calc + + - name: Install LibreOffice headless (Windows) + if: runner.os == 'Windows' + run: choco install libreoffice -y + + - name: Install LibreOffice headless (macOS) + if: runner.os == 'macOS' + run: brew install --cask libreoffice + + - name: Run tests + run: hatch run test:cov-retry + + - name: Run unit tests with lowest direct dependencies + run: | + hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt + hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt + hatch run test:unit + + - name: Nightly - run tests with Haystack main branch + if: github.event_name == 'schedule' + run: | + hatch env prune + hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main + hatch run test:cov-retry + + + notify-slack-on-failure: + needs: run + if: failure() && github.event_name == 'schedule' + runs-on: ubuntu-slim + steps: + - uses: deepset-ai/notify-slack-action@v1 + with: + slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }} diff --git a/README.md b/README.md index 7319a7bde8..eb91034b44 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [jina-haystack](integrations/jina/) | Connector, Embedder, Ranker | [![PyPI - 
Version](https://img.shields.io/pypi/v/jina-haystack.svg)](https://pypi.org/project/jina-haystack) | [![Test / jina](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml) | | [langfuse-haystack](integrations/langfuse/) | Tracer | [![PyPI - Version](https://img.shields.io/pypi/v/langfuse-haystack.svg?color=orange)](https://pypi.org/project/langfuse-haystack) | [![Test / langfuse](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/langfuse.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/langfuse.yml) | | [lara-haystack](integrations/lara/) | Translator | [![PyPI - Version](https://img.shields.io/pypi/v/lara-haystack.svg)](https://pypi.org/project/lara-haystack) | [![Test / lara](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/lara.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/lara.yml) | +| [libreoffice-haystack](integrations/libreoffice/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/libreoffice-haystack.svg)](https://pypi.org/project/libreoffice-haystack) | [![Test / libreoffice](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/libreoffice.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/libreoffice.yml) | | [llama-cpp-haystack](integrations/llama_cpp/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/llama-cpp-haystack.svg?color=orange)](https://pypi.org/project/llama-cpp-haystack) | [![Test / llama-cpp](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_cpp.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_cpp.yml) | | [llama-stack-haystack](integrations/llama_stack/) | Generator | [![PyPI - 
Version](https://img.shields.io/pypi/v/llama-stack-haystack.svg?color=orange)](https://pypi.org/project/llama-stack-haystack) | [![Test / llama-stack](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_stack.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_stack.yml) | | [mcp-haystack](integrations/mcp/) | Tool | [![PyPI - Version](https://img.shields.io/pypi/v/mcp-haystack.svg?color=orange)](https://pypi.org/project/mcp-haystack) | [![Test / mcp](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mcp.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mcp.yml) | diff --git a/integrations/libreoffice/LICENSE.txt b/integrations/libreoffice/LICENSE.txt new file mode 100644 index 0000000000..6134ab324f --- /dev/null +++ b/integrations/libreoffice/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2023-present deepset GmbH + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/integrations/libreoffice/README.md b/integrations/libreoffice/README.md new file mode 100644 index 0000000000..92bea2c831 --- /dev/null +++ b/integrations/libreoffice/README.md @@ -0,0 +1,12 @@ +# libreoffice-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/libreoffice-haystack.svg)](https://pypi.org/project/libreoffice-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/libreoffice-haystack.svg)](https://pypi.org/project/libreoffice-haystack) + +- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/libreoffice/CHANGELOG.md) + +--- + +## Contributing + +Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). 
diff --git a/integrations/libreoffice/pydoc/config_docusaurus.yml b/integrations/libreoffice/pydoc/config_docusaurus.yml new file mode 100644 index 0000000000..e9efad1d32 --- /dev/null +++ b/integrations/libreoffice/pydoc/config_docusaurus.yml @@ -0,0 +1,13 @@ +loaders: + - modules: + - haystack_integrations.components.converters.libreoffice.converter + search_path: [../src] +processors: + - type: filter + documented_only: true + skip_empty_modules: true +renderer: + description: Haystack 2.x component to convert files using LibreOffice + id: integrations-libreoffice + filename: libreoffice.md + title: LibreOffice File Converter diff --git a/integrations/libreoffice/pyproject.toml b/integrations/libreoffice/pyproject.toml new file mode 100644 index 0000000000..f979c41c93 --- /dev/null +++ b/integrations/libreoffice/pyproject.toml @@ -0,0 +1,151 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "libreoffice-haystack" +dynamic = ["version"] +description = "Haystack 2.x component to convert files using LibreOffice." 
+readme = "README.md" +requires-python = ">=3.10" +license = "Apache-2.0" +keywords = [] +authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }, {name = "Max Swain"}] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = ["haystack-ai"] + +[project.urls] +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/libreoffice#readme" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/libreoffice" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/libreoffice-v(?P<version>.*)' + +[tool.hatch.version.raw-options] +root = "../.." 
+git_describe_command = 'git describe --tags --match="integrations/libreoffice-v[0-9]*"' + +[tool.hatch.envs.default] +installer = "uv" +dependencies = ["haystack-pydoc-tools", "ruff"] + +[tool.hatch.envs.default.scripts] +docs = ["haystack-pydoc pydoc/config_docusaurus.yml"] +fmt = "ruff check --fix {args}; ruff format {args}" +fmt-check = "ruff check {args} && ruff format --check {args}" + +[tool.hatch.envs.test] +dependencies = [ + "pytest", + "pytest-asyncio", + "pytest-cov", + "pytest-rerunfailures", + "mypy", + "pip", +] + +[tool.hatch.envs.test.scripts] +unit = 'pytest -m "not integration" {args:tests}' +integration = 'pytest -m "integration" {args:tests}' +all = 'pytest {args:tests}' +cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x {args:tests}' +types = "mypy -p haystack_integrations.components.converters.libreoffice {args}" + +[tool.mypy] +install_types = true +non_interactive = true +check_untyped_defs = true +disallow_incomplete_defs = true + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = [ + "A", + "ANN", + "ARG", + "B", + "C", + "DTZ", + "E", + "EM", + "F", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in abstract base classes + "B027", + # Allow function calls in argument defaults (common Haystack pattern for Secret.from_env_var) + "B008", + # Ignore checks for possible passwords + "S105", + "S106", + "S107", + # Ignore complexity + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", + # Allow `Any` type - used legitimately for dynamic types and SDK boundaries + "ANN401", +] + +[tool.ruff.lint.isort] +known-first-party = ["haystack_integrations"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.lint.per-file-ignores] +# Tests can use magic values, assertions, relative imports, and don't need type annotations +"tests/**/*" = 
["PLR2004", "S101", "TID252", "ANN"]
+
+[tool.coverage.run]
+source = ["haystack_integrations"]
+branch = true
+parallel = false
+
+[tool.coverage.report]
+omit = ["*/tests/*", "*/__init__.py"]
+show_missing = true
+exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"]
+
+[tool.pytest.ini_options]
+addopts = "--strict-markers"
+markers = [
+  "integration: integration tests",
+]
+log_cli = true
+asyncio_default_fixture_loop_scope = "function"
diff --git a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/__init__.py b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/__init__.py
new file mode 100644
index 0000000000..2d46328fd8
--- /dev/null
+++ b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/__init__.py
@@ -0,0 +1,7 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from .converter import LibreOfficeFileConverter
+
+__all__ = ["LibreOfficeFileConverter"]
diff --git a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py
new file mode 100644
index 0000000000..4106978098
--- /dev/null
+++ b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py
@@ -0,0 +1,330 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import shutil
+import subprocess
+from asyncio import create_subprocess_exec
+from collections.abc import Iterable
+from pathlib import Path
+from tempfile import NamedTemporaryFile, TemporaryDirectory
+from typing import Any, ClassVar, Literal, TypedDict
+
+from haystack import component, default_from_dict, default_to_dict
+from haystack.dataclasses import ByteStream
+from typing_extensions import Self
+
+
+class LibreOfficeFileConverterOutput(TypedDict):
+    output: list[ByteStream]
+
+
+@component
+class LibreOfficeFileConverter:
+    """
+    Component that uses LibreOffice's command line utility (soffice) to convert files into various formats.
+
+    ### Usage examples
+
+    **Simple conversion:**
+    ```python
+    from pathlib import Path
+
+    from haystack_integrations.components.converters.libreoffice import LibreOfficeFileConverter
+
+    # Convert documents
+    converter = LibreOfficeFileConverter()
+    results = converter.run(sources=[Path("sample.doc")], output_file_type="docx")
+    print(results["output"])  # [ByteStream(data=b'...', meta={}, mime_type=None)]
+    ```
+
+    **Conversion pipeline:**
+    ```python
+    from pathlib import Path
+
+    from haystack import Pipeline
+    from haystack.components.converters import DOCXToDocument
+
+    from haystack_integrations.components.converters.libreoffice import LibreOfficeFileConverter
+
+    # Create pipeline with components
+    pipeline = Pipeline()
+    pipeline.add_component("libreoffice_converter", LibreOfficeFileConverter())
+    pipeline.add_component("docx_converter", DOCXToDocument())
+
+    pipeline.connect("libreoffice_converter.output", "docx_converter.sources")
+
+    # Run pipeline and convert legacy documents into Haystack documents
+    results = pipeline.run(
+        {
+            "libreoffice_converter": {
+                "sources": [Path("sample_doc.doc")],
+                "output_file_type": "docx",
+            }
+        }
+    )
+    print(results["docx_converter"]["documents"])
+    ```
+    """
+
+    SUPPORTED_TYPES: ClassVar[dict[str, frozenset[str]]] = {
+        # Documents
+        "doc": frozenset(["pdf", "docx", "odt", "rtf", "txt", "html", "epub"]),
+        "docx": frozenset(["pdf", "doc", "odt", "rtf", "txt", "html", "epub"]),
+        "odt": frozenset(["pdf", "docx", "doc", "rtf", "txt", "html", "epub"]),
+        "rtf": frozenset(["pdf", "docx", "doc", "odt", "txt", "html"]),
+        "txt": frozenset(["pdf", "docx", "doc", "odt", "rtf", "html"]),
+        "html": frozenset(["pdf", "docx", "doc", "odt", "rtf", "txt"]),
+        # Spreadsheets
+        "xlsx": frozenset(["pdf", "xls", "ods", "csv", "html"]),
+        "xls": frozenset(["pdf", "xlsx", "ods", "csv", "html"]),
+        "ods": frozenset(["pdf", "xlsx", "xls", "csv", "html"]),
+        "csv": frozenset(["pdf", "xlsx", "xls", "ods"]),
+        # Presentations
+        "pptx": frozenset(["pdf", "ppt", "odp", "html", "png", "jpg"]),
+        "ppt": frozenset(["pdf", "pptx", "odp", "html", "png", "jpg"]),
+        "odp": frozenset(["pdf", "pptx", "ppt", "html", "png", "jpg"]),
+    }
+
+    def __init__(self) -> None:
+        """Check whether soffice is installed."""
+        soffice_path = shutil.which("soffice")
+        if soffice_path is None:
+            msg = """LibreOffice (soffice) is required but not installed or not in PATH.
+
+- Install instructions: https://www.libreoffice.org/get-help/install-howto/"""
+            raise FileNotFoundError(msg)
+
+        self.soffice_path = soffice_path
+
+    def to_dict(self) -> dict[str, Any]:
+        """
+        Serializes the component to a dictionary.
+
+        :returns:
+            Dictionary with serialized data.
+        """
+        return default_to_dict(self)
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> Self:
+        """
+        Deserializes the component from a dictionary.
+
+        :param data:
+            The dictionary to deserialize from.
+        :returns:
+            The deserialized component.
+        """
+        return default_from_dict(cls, data)
+
+    def _get_conversion_args(
+        self, source: str | Path, output_directory: str | Path, output_file_type: str
+    ) -> tuple[Path, list[str]]:
+        """
+        Validate source file and return the soffice arguments for conversion.
+
+        :param source: Source file path.
+        :param output_directory: Output directory to save converted files to.
+        :param output_file_type: Target file format extension (e.g. ``"pdf"``).
+        :returns: Tuple of ``(output_path, soffice_args)`` where ``output_path`` is the
+            expected path of the converted file and ``soffice_args`` is the list of
+            arguments to pass to ``soffice``.
+        :raises FileNotFoundError: If ``source`` does not exist.
+        :raises OSError: If ``output_directory`` does not exist or is not writable.
+        """
+        source_path = Path(source)
+        output_path = Path(output_directory)
+
+        # Source file must exist
+        if not source_path.is_file():
+            msg = f"{source=} does not exist"
+            raise FileNotFoundError(msg)
+
+        # Output directory must exist and be writable
+        if not output_path.is_dir() or not os.access(output_path, os.W_OK):
+            msg = f"{output_directory=} must exist and be writable"
+            raise OSError(msg)
+
+        args = [
+            self.soffice_path,
+            "--headless",
+            "--convert-to",
+            output_file_type,
+            "--outdir",
+            str(output_directory),
+            str(source),
+        ]
+        return (output_path / source_path.name).with_suffix(f".{output_file_type}"), args
+
+    def _validate_args(self, output_file_type: str, input_file_type: str | None = None) -> None:
+        """
+        Validate that the input and output file types are supported.
+
+        :param output_file_type: Target file format extension to convert to.
+        :param input_file_type: Source file format extension. If provided, validates that
+            it is a supported input type and that ``output_file_type`` is a valid conversion
+            target for it.
+        :raises ValueError: If ``input_file_type`` is not in :attr:`SUPPORTED_TYPES`, or if
+            ``output_file_type`` is not a valid conversion target for the given ``input_file_type``.
+        """
+        # Conversion types cannot be validated when the input type is unknown - i.e., source is a ``ByteStream``
+        if input_file_type is None:
+            return
+
+        if input_file_type not in self.SUPPORTED_TYPES:
+            supported_types = ", ".join(self.SUPPORTED_TYPES)
+            msg = f"{input_file_type=} is not supported and must be one of type {supported_types}"
+            raise ValueError(msg)
+
+        if output_file_type not in (output_types := self.SUPPORTED_TYPES[input_file_type]):
+            supported_types = ", ".join(output_types)
+            msg = (
+                f"{output_file_type=} is not supported for {input_file_type=} and must be one of type {supported_types}"
+            )
+            raise ValueError(msg)
+
+    @staticmethod
+    def _resolve_source(source: str | Path | ByteStream, workdir: str | Path) -> tuple[str | Path, str | None]:
+        """
+        Return an on-disk path for ``source`` together with its input file type.
+
+        :param source: File path or in-memory ``ByteStream`` to convert.
+        :param workdir: Existing directory in which ``ByteStream`` data is written to a temporary file.
+        :returns: Tuple of ``(path, input_file_type)``. For file paths, ``path`` is ``source`` unchanged and
+            ``input_file_type`` is its lower-cased extension without the leading dot. For ``ByteStream``
+            sources, ``path`` is a temporary file inside ``workdir`` and ``input_file_type`` is ``None``
+            because the type cannot be inferred from a filename.
+        """
+        if isinstance(source, ByteStream):
+            # Close the handle before soffice runs: this flushes the data to disk and lets another process
+            # open the file (an open ``NamedTemporaryFile`` cannot be reopened on Windows).
+            # ``delete=False`` keeps the file after closing; the caller removes it by cleaning up ``workdir``.
+            with NamedTemporaryFile(mode="wb", dir=workdir, delete=False) as f:
+                f.write(source.data)
+            return Path(f.name), None
+
+        # ``Path.suffix`` only inspects the final path component, unlike splitting the whole path on "."
+        return source, Path(source).suffix.removeprefix(".").lower()
+
+    @component.output_types(output=list[ByteStream])
+    def run(
+        self,
+        sources: Iterable[str | Path | ByteStream],
+        output_file_type: Literal[
+            "pdf",
+            "doc",
+            "docx",
+            "odt",
+            "rtf",
+            "txt",
+            "html",
+            "xlsx",
+            "xls",
+            "ods",
+            "csv",
+            "pptx",
+            "ppt",
+            "odp",
+            "epub",
+            "png",
+            "jpg",
+        ],
+    ) -> LibreOfficeFileConverterOutput:
+        """
+        Convert office files to the specified output format using LibreOffice.
+
+        :param sources:
+            List of sources to convert. Each source can be a file path (``str`` or
+            ``Path``) or a ``ByteStream``. For ``ByteStream`` sources, the input file
+            type cannot be inferred from a filename, so no file type validation is
+            performed for them.
+        :param output_file_type:
+            Target file format to convert to. Must be a valid conversion target for
+            each source's input type — see :attr:`SUPPORTED_TYPES` for the full mapping.
+        :returns:
+            A dictionary with the following key:
+            - ``output``: List of ``ByteStream`` objects containing the converted file
+              data, in the same order as ``sources``.
+        :raises FileNotFoundError: If a source file path does not exist.
+        :raises OSError: If the internal temporary output directory is not writable.
+        :raises ValueError: If a source's file type is not in :attr:`SUPPORTED_TYPES`,
+            or if ``output_file_type`` is not a valid conversion target for it.
+        :raises subprocess.CalledProcessError: If ``soffice`` exits with a non-zero status.
+        """
+        outputs: list[ByteStream] = []
+        with TemporaryDirectory() as tmpdir:
+            for source in sources:
+                source_path, input_file_type = self._resolve_source(source, tmpdir)
+                self._validate_args(output_file_type, input_file_type)
+                output_path, args = self._get_conversion_args(source_path, tmpdir, output_file_type)
+
+                subprocess.run(args, check=True)  # noqa: S603
+                outputs.append(ByteStream(data=output_path.read_bytes()))
+
+        return {"output": outputs}
+
+    @component.output_types(output=list[ByteStream])
+    async def run_async(
+        self,
+        sources: Iterable[str | Path | ByteStream],
+        output_file_type: Literal[
+            "pdf",
+            "doc",
+            "docx",
+            "odt",
+            "rtf",
+            "txt",
+            "html",
+            "xlsx",
+            "xls",
+            "ods",
+            "csv",
+            "pptx",
+            "ppt",
+            "odp",
+            "epub",
+            "png",
+            "jpg",
+        ],
+    ) -> LibreOfficeFileConverterOutput:
+        """
+        Asynchronously convert office files to the specified output format using LibreOffice.
+
+        This is the asynchronous version of the `run` method with the same parameters and return values.
+
+        :param sources:
+            List of sources to convert. Each source can be a file path (``str`` or
+            ``Path``) or a ``ByteStream``. For ``ByteStream`` sources, the input file
+            type cannot be inferred from a filename, so no file type validation is
+            performed for them.
+        :param output_file_type:
+            Target file format to convert to. Must be a valid conversion target for
+            each source's input type — see :attr:`SUPPORTED_TYPES` for the full mapping.
+        :returns:
+            A dictionary with the following key:
+            - ``output``: List of ``ByteStream`` objects containing the converted file
+              data, in the same order as ``sources``.
+        :raises FileNotFoundError: If a source file path does not exist.
+        :raises OSError: If the internal temporary output directory is not writable.
+        :raises ValueError: If a source's file type is not in :attr:`SUPPORTED_TYPES`,
+            or if ``output_file_type`` is not a valid conversion target for it.
+        :raises subprocess.CalledProcessError: If ``soffice`` exits with a non-zero status.
+        """
+        outputs: list[ByteStream] = []
+        with TemporaryDirectory() as tmpdir:
+            for source in sources:
+                source_path, input_file_type = self._resolve_source(source, tmpdir)
+                self._validate_args(output_file_type, input_file_type)
+                output_path, args = self._get_conversion_args(source_path, tmpdir, output_file_type)
+
+                process = await create_subprocess_exec(*args)
+                # Wait for completion before the next conversion, as only one instance of soffice can run at once
+                returncode = await process.wait()
+                if returncode != 0:
+                    # Fail the same way ``run`` does via ``subprocess.run(..., check=True)``
+                    raise subprocess.CalledProcessError(returncode, args)
+                outputs.append(ByteStream(data=output_path.read_bytes()))
+
+        return {"output": outputs}
diff --git a/integrations/libreoffice/src/haystack_integrations/components/converters/py.typed b/integrations/libreoffice/src/haystack_integrations/components/converters/py.typed
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/integrations/libreoffice/tests/__init__.py b/integrations/libreoffice/tests/__init__.py
new file mode 100644
index 0000000000..c1764a6e03
--- /dev/null
+++ b/integrations/libreoffice/tests/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/integrations/libreoffice/tests/test_converter.py b/integrations/libreoffice/tests/test_converter.py
new file mode
100644 index 0000000000..2111170067 --- /dev/null +++ b/integrations/libreoffice/tests/test_converter.py @@ -0,0 +1,129 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from collections.abc import Generator +from pathlib import Path +from unittest.mock import patch + +import pytest +from haystack.dataclasses import ByteStream + +from haystack_integrations.components.converters.libreoffice import LibreOfficeFileConverter + + +@pytest.fixture +def converter() -> LibreOfficeFileConverter: + return LibreOfficeFileConverter() + + +@pytest.fixture +def mock_converter() -> Generator[LibreOfficeFileConverter, None]: + with patch("shutil.which", return_value="/usr/bin/soffice"): + yield LibreOfficeFileConverter() + + +@pytest.fixture +def test_files_path() -> Path: + return Path("tests") / "test_files" + + +class TestLibreOfficeFileConverter: + def test_init(self, mock_converter: LibreOfficeFileConverter) -> None: + assert isinstance(mock_converter, LibreOfficeFileConverter) + assert isinstance(mock_converter.soffice_path, str) + + def test_init_raises_when_soffice_not_found(self) -> None: + with patch("shutil.which", return_value=None): + with pytest.raises(FileNotFoundError, match="LibreOffice"): + LibreOfficeFileConverter() + + def test_to_dict(self, mock_converter: LibreOfficeFileConverter) -> None: + data = mock_converter.to_dict() + assert data == { + "type": "haystack_integrations.components.converters.libreoffice.converter.LibreOfficeFileConverter", + "init_parameters": {}, + } + + def test_from_dict(self) -> None: + data = { + "type": "haystack_integrations.components.converters.libreoffice.converter.LibreOfficeFileConverter", + "init_parameters": {}, + } + with patch("shutil.which", return_value="/usr/bin/soffice"): + converter = LibreOfficeFileConverter.from_dict(data) + assert isinstance(converter.soffice_path, str) + + def test_run_unsupported_input_type(self, mock_converter: LibreOfficeFileConverter) -> None: 
+ # .pdf is not a supported input type in SUPPORTED_TYPES + with pytest.raises(ValueError): + mock_converter.run(["test_file.pdf"], output_file_type="docx") + + def test_run_unsupported_output_type(self, mock_converter: LibreOfficeFileConverter) -> None: + # .doc -> .png is not a valid conversion + with pytest.raises(ValueError): + mock_converter.run(["test_file.doc"], output_file_type="png") + + def test_run_no_file(self, mock_converter: LibreOfficeFileConverter) -> None: + with pytest.raises(FileNotFoundError): + mock_converter.run(["nonexistent_file.doc"], output_file_type="docx") + + @pytest.mark.integration + def test_run(self, converter: LibreOfficeFileConverter, test_files_path: Path) -> None: + paths = [ + test_files_path / "doc" / "sample_doc.doc", + test_files_path / "ppt" / "sample_ppt.ppt", + test_files_path / "xls" / "basic_tables_two_sheets.xls", + ] + + results = converter.run(paths, output_file_type="pdf") + + output = results["output"] + assert len(output) == 3 + for stream in output: + assert isinstance(stream, ByteStream) + assert len(stream.data) > 0 + + @pytest.mark.integration + def test_run_bytestream_source(self, converter: LibreOfficeFileConverter, test_files_path: Path) -> None: + source_path = test_files_path / "doc" / "sample_doc.doc" + bytestream = ByteStream(data=source_path.read_bytes()) + + results = converter.run([bytestream], output_file_type="pdf") + + output = results["output"] + assert len(output) == 1 + assert isinstance(output[0], ByteStream) + assert len(output[0].data) > 0 + + @pytest.mark.asyncio + @pytest.mark.integration + async def test_run_async(self, converter: LibreOfficeFileConverter, test_files_path: Path) -> None: + paths = [ + test_files_path / "doc" / "sample_doc.doc", + test_files_path / "ppt" / "sample_ppt.ppt", + test_files_path / "xls" / "basic_tables_two_sheets.xls", + ] + + results = await converter.run_async(paths, output_file_type="pdf") + + output = results["output"] + assert len(output) == 3 + for 
stream in output: + assert isinstance(stream, ByteStream) + assert len(stream.data) > 0 + + @pytest.mark.asyncio + @pytest.mark.integration + async def test_run_async_bytestream_source( + self, converter: LibreOfficeFileConverter, test_files_path: Path + ) -> None: + source_path = test_files_path / "doc" / "sample_doc.doc" + bytestream = ByteStream(data=source_path.read_bytes()) + + results = await converter.run_async([bytestream], output_file_type="pdf") + + output = results["output"] + assert len(output) == 1 + assert isinstance(output[0], ByteStream) + assert len(output[0].data) > 0 diff --git a/integrations/libreoffice/tests/test_files/doc/sample_doc.doc b/integrations/libreoffice/tests/test_files/doc/sample_doc.doc new file mode 100644 index 0000000000000000000000000000000000000000..70b72d50d5f8d14de23b6f46af63ee40872f60c5 GIT binary patch literal 12800 zcmeI2U2I%O701u@`ZHg3oYVyi&@LnxnGf5l7fU9+}H z`a}Ui`w*m5eV`9W1Sv=eDilgpq^c4T4=6$)`p{S^PeCB0iVCGs6tzW-`Tb}1ve(9I zZ`=^2u(LY%&e!>vbI#12IkT_+arLDi{bIwPZBBZ?5_Wy2&Z=Ye0r|$LTyIuEUUk=J zW@e&lvgPyv(!hIpS!KgYj93N8<~6{bKpn6QSPrZJRsySlI{;k3Rs#*d8elCTI)Zii ze4c#yb(?D7QOok1w9r0j8QwX20?U=28u6I;&`QCKuNS5CU#w4=e?bF_=}PT!{YUG! 
z>gLY>TKeAT{MVYHZ-nOmdf-FA2H-AWBk*D1ZeSA-0-J#*K>U-uKLTt8w&m;Y;k_fD zFO0IByIC3t(0qagUi7N#^hGy2@1Ay0_B)gw$Ym$8!;|5$Y;N%0uI%GeV?&uq_f0s| ztGcf2fcp|C-%P=^H@Dl>`(OB)C$!KL|MN^2>a;*x1D>{wUC0;Q7s+?b@o9Q%rh>Ih z-?GE3Z#f&Yl#NoB-!%*-g1{0rC2?WMI2({|-CSQ)!Zc74;8nYU~oT^g;+hHMyVrjSSL z?oskN;JAB^+X!_Bky)#;QkSqT2)f8Gvp!2xKMIY|4$xP)nn_1`Y7n>vf!nR)>1#8O zbXOp3wRs`ip?G~$U`@e6de>Mt_$eENRtCQMs;{;+XD_3MlU9um)~(XdzSDSB?S`|39by8^8MmG&w4 zPPo1mZr`xanxMHqkbY`x0LjK_O*111hY^#uIz3r(DQoO!Y8#K9*J<40V zmxmeApp!+VJ53mv%{~DKG~)Ucbf|qxBhp^RgS@n6V_+<4H%&=zK%w?dRJ%@r6gjA=Ww;*dZ_WVBwsXXFk=jO0-+D{KEYcbbvb+n z?2kdGlo{5G8ON+2Oih#BTTA2KiTkDDB%Cyy0__JENiu&wlFI2LGl$meYSK&%b56d{ znD&a(B3#sbz)2=x%Z_Y;G*YDyed@ zexlV1bO3hhyKefpyvn2wJ^BV-e(AC4z~wK!5&Vk0s+XGH{BaN*3hPkA<%VB-B2tr6 zguYvRZByCNRHoh6`X2dbU1g&^s7+!%=@Yh4nwh!7=~n_PFC7a-`MK$7S8-A*Aw_Rr zn*XUjYv*s8xAqkMWzFY!YhMzjibRps=*sR8+k{P0$% z3VZ!lvn=x~YoM%w+e!nyZ1%_4`S^%J7hC!P&hgspr(NAjb~=3=Omr0Z42QJP#T<0l zYyvCAL1Th89S~Y#dsAYI&sL`=>fU*t_8QQ<@4jNPL*s2r1s7%;Sdg*YX zeA=Dk6$J^2fq1M@$ElAVQ%>rTeks>80}5rXHVG9%VpR4Z9VrUO%Gdx#|MWkfCR=SQF6c&h`JBoHSZ zv!sbZyO*8c{nQk6Q5aEy=5A>1w07GC^$;v+TJm|#MHY~}I!tP08uKt*B#|_vSdHv3 zMczmILwd_f5wGk5 zAQDJg(SO2u!TdIkJn1;xynl>3YTkyENHL%FBB|nIj@w(o2(O(?UNZwurbxm*MmmK~ zBKg#J7`=Eq`?)dU{6&5*Pw|o*hnn{W(G|Wl+JxqYh;NVb3tb?jpnF+HzR?ed0ceGQ z#_8>=FpF@qa1Wince+^?N#JKfGi{zn$gGpiMbei__ zOWx@{!AGM99IocD=35$RWfRRcSuM&W9&WR^<~;&yqp<_G^J+($ZcCJ@sA5svOnqb~xTTW z%kjcL;)OTkg+Ij$7vhDN;)M@Zp39NOWbp8CfY|=hY0f1i^*5`JcoYNTc8O$=N(Q!E zHq|`W9M}B53h2&iG_Kjt#|z!@!k&0x7EZh&%HOQ9l|biNr|NZ)wri&kTB6E+v-}(D z9^4{vD!D+4L#cIj)75@74buwc5`A_sPj38ucW8rG5D+1CnTo`y&s0V=wbTr^S)$GE zn_g}SS(`DLFS!Wfze@^p7uuNXzGPLh_1nO^K$3NRE3h3aY+s9Y9OWMGXoYP$_pRpe+;2NKpF3041Z+8d zrYZ^SqP_L>NYFAav3FD2zROm%+|n_Wnf+HZQ0)Fp%SXktA#!O{yZMt5~Wc z68i72{y*j8g{q1Ep5yz@_uLNu=s)A7^4|g>?M#%85rxp-OMK*6k|^pr=^IOUZdk(j mwWdV()>Z$(`Ns6oz6Ac;;s1OIV=J|bGkpG03Qqk0zyAWhCRJYm literal 0 HcmV?d00001 diff --git a/integrations/libreoffice/tests/test_files/ppt/sample_ppt.ppt 
b/integrations/libreoffice/tests/test_files/ppt/sample_ppt.ppt new file mode 100644 index 0000000000000000000000000000000000000000..b2a9c61935184f9ec2320ee46c28904c06f26e83 GIT binary patch literal 499200 zcmeI52VfM%|Hb!6=p|H9Y|%gV0*D1grKs2e1uQ=sT{=h+utq^qR0KteAfloIDkAnu zvjc(%3Mf`o2rcv`7WLv32K2eKy7d)I18K&&H;76x!^od z7n~0+02hLKpgw2-8iI?!#o!Xq2wVy-1DAs8A$60`!XK^xE(v;*ydNsA8r?+7}9&Y%nE3c7*rpa-}E^aQ;? zZ_o$a3HpM5pg*_^+zsvl_k#Pt{a_HdmXNyRGa1Xf@xQ0*gA4im9O4wsRIc;m&94{v zai5o)j4bT_%ge2~uKVxRYr{nliE!gD|K_|tvCTk$(vX0_8!O8{@j%-#P3Bj4IMtc# ze0b7IqwQBWOK#e|Z@ZpNyLar?JNaBAtC>Cl+YHEbF1#;WlZ@`0y5qJP!~WVi+tf3g z6B77-QJfJ!!*0N_sYmXn)@evB)_@x5TxzZ}so^f6HanN^r*Xa(*U#Xn z0Y{fmGo41QR}*(Nad8gU8c^e%O-*;MtL(Nu3&xq?^ZX<^D-)Jc*$13e3C6}0D({Iz z`*(;i$Hrz8XH)N+Pr$Ya14+)tgz6Ab!}&VNSXt9qm=wHklC!w@*T$psev;w5Iki|< zYPMcZGB&rPZc29Rovze=rY_6-Q>=XXjIbI|Lv_TTsS6|f6yxqTUX2K;Ct+Pg4cL}= z+1T~uI2oq9yY*o*wR}&Gdw{H;V%TlNx{7dmXX_@buk6nhbLwY=b**ck;i*-awQ(@y zV#M0Us{z)Sa5@sZR@l{yeK)tY5OkZ@|6(|pkYvh5irX?VN>S0(3&)w1Wb!=Ohdfe} zB3j>hDXCn*)FSD->(Nd;HrcjnoIR%HV|?P&sc$lrTRzg$*K=gbuML$ixJ~In-Pq*&q}XG7&Da_I;3dT# z+iNCX!4F}i*kgOm@DcnFMv6VQ*9=F&4`HO(V|&f;7W@!Kiaoa13{SxiVWik&d(Chf z{18TpJ+{{j=fMwQq}XG7&7?!{Ll`Oc*j_Vf6Z{ZHiaAbl78Jh_B93;_gWM(h-5FYw zoSg}`8^_Kbx2|+E&+ddXIbpgWZFBkmdR50+Mqb$bUwZoMlsgk#4`)|G(DWW01izbW z^@)FyQ#|o}jvdF|V=_lygPi_ncaNh(qHOX~MsTKgW}=+ae-m68vg?UE($hc4Cdwv? z{wNz0>N!i@nh9gwnyD;(%O+XQU4Ut6Nv5ahI5tIsrNcdCqUe6?wP1-cr`+iCzeb4v zU}ql zm@y+qcjlVY-p0=%t~uFTdvv-pnmC(3W3Lloofa_cq?_E7|2i!@KgTAA)-?+<)oI!J z$*WGw&QD%-T4q1xb7f@+mM_8Z`fff;;3!x#W)3C0n*L?{X0GY8^OKpCkMR>OD+}?H zmZ_%C?mn4WS%sg>HNEv?YB-b1$+@u7)*bgU4lOH&8q13LZ$7TZXWRa88CBAaC*6Lw2*$zWg3NAP z=3=G4PZ0EF^)joKLAQP@t=p_t2HpCtv~IIn8FcHn(z?xRWzenPO6#^qh?Pm1dOmI4 z(T%-nX>?+5S|(-c=~%aWt+b=&&K_%JcH!=|(vD=%Aav6wyKwhfX-8a~J=V%3XKz}X z+NLtm+iPXeZAx5NndFp7tV5AYa*CwYEy=lppbn0=!oGWkd&QZ!CwHdETNV|dSCKh!#7mRgd z&TYf_7-AEkq7KHoi1}`U$p~j#k;Z{BiGc*;e;{XrRwUrA2v+=s70o%%SEpPcJ6l#? 
z$Bba0@}zXHFPVQ^l3a%C5+8qDL*tT|@3>SO{RnaSyM5n?;}YcU9XK<*nYf$?mqA{N z8Y1AOQlNC2X`kK?<3*hmr+Ur1m>;d#F2D?_*$;5{1Ks^YxU;h|^tWL$k=U91HzSZc zQ>0}^Z#yLAxL6dHqFcmInMLmsGZ}CPin-Oix^w$ z$QVFpC)pj*N@irNtvh~XzISlP$$GoryJQ@Ahl=wt0yCUW+cpUUQD8pxN~~^ zb0)hZMb$Yn_qTkV%3+YNVma`&A|HI25ciF*sigx|Zam~wx|mt{GQ1hS?laNO;LAkT z@MQ|f$>4>gQ`x)4#R>E?XNtg?Y15gw+4D6R%Sj+h+<4=SwDBBgiI4foNv0k#?rdFe z#?l<;6_+z}uf3dIhNB>7#dF|nLM@ZW%;!O8TunODjXtpD_n$cP_Zrdcp*Hv7-6Lewp zt2w#XjQHl|&W!!`!JWzNxrb;*klW|OT{GO7GLlz##ugJ>GZUM;TT{MFK0Dq`=PK+2 zZmnI!oqFKXzOq=aGRc z>A{+jffoD^jtnsQn`va=%2H-zpauPa%p(IuaF@T40mL%mQI5SM1OB*%#wFnZ#>I>b zn7Ejc0mGY#%SzHH*vd{MF4>I?v_0cMj0}7iFe3xY6GMA##EF)aiEY_5vtBmX({&H& z>5{SN!Sr+2=h~jV$#>>sdllxJ@o(+*_fm`IU~l!>2g2S_MUB08l?=7l#MSh-)l1#m zKQ6qz)J!8qIpn7m`N&Tu)uU&m2wmKVks^Q3-qa7-_3ZbKgBEnp{tQ-RsAoSzJ$v=+ zDSCFDpjI4Y?JH}=0e#FI*1M`_A6)g2buL~j4l=JE$*ymz6$g9A^9u9$h3VNZqiJe- z_6zx6Goxp3`X@!0k1;*_w(j+O_U!Q!TyfxUIR`kOUC(pEnR)lbqO0Y+Z*#2vmb2kM zyO#6bF^ht3InQEgwpz|J)N)qKnWCqbv*jRb8&NH1wVZ?Pn_z3$^caFOR>^MZQOkMn zie$S+Kx=h_vw6|qKQJxlPE0+RmUC^2{QXYbCY+OZKXTvsI?7#} z`G%e#%=2kK^OR#@eS*+G!(2Rv{M5Ei-uWrI#w7}FhPi~{j9KQ;r!M#3470{1G%n$P zW-Rj)l(v7G*E3_8{phJn+o!dxm;!A^JM=av!-{!^&V0;58pDd-29@3Thn~9BQm{^a!c_Y7m2YwM%&68&WH(-rl@g-riKQh>BN;XZLj9=WH4B3pxJZ zbkh8GboH>}bACI;*N-eg0SJ5DJ$|z7M{>Rp_alj2zWR~4D>VJce8=Sg^dpIjN!`$X z8F2R(xQ>?ljmk8_4Zuqg|5@G(@5&rxm13CWM5$^IgGJyK;?_WDQfN{z0 z*N*(ZgPcfQvKtwwUhhEkOp=St2~?=mH_Na85U0ZWX15&pT5&-5I*`AEOgSi^U$-%O zkX=u1@4GJw`W@YxVOCF0JvsH{)RVKb#S4nm=f^6-`+Z!kR1VG>XYJ!?hFQOE6H6l8OVwjyaY`9I!2!^}&$!=7QrPqs76<^|seGTdej*Ni%4cAItdB^Y1LFdKC=ecWS{gX$k&BM&Zr4#IgLJnlUax z_30#>YR4ts;Y+@MI}AU;aS4AqiZVu%)R1!*IbC?`bQ`C$lgvul?(Ew+Z8_81{oa9Z$(-%OcjK;``@J{& zey)r5TO=UZ{`6I7g!2qS21fb-cYPTPx34l$nx-TeS}KhWJj zUWAf;#8>kOBTg3H?XlSn^wlXn^hdELWya93yS1mLG?@Ww!j3vZx-BN z=9j>yA|>?}B)gbt=bLJk(JI3O07m{u&&G?1MI43WDQq!>pTs&ACv zsJ=;ylfer~$Fn!z$h|#`4?E9i+P3C%0w_vbj1Mi`9d6BGzBtb0By-ldGP#=H+cX8h zc_qoThRt0PocdfeJ*ycRIlZQjwZt4)o6zV$q`Tt`DmA)d_N*CeOkMdP|8r)|4ER`v 
z0{skYMRMhB`#$mJ&uv9>;H~YYdGluCyKm)X4{r*rSfX*Y>~dR4LN~mn@IPnX!pUtr z_lY%sZY!1pYt=71Fszx}mOX2x&=hoTTe45QT}F(8+Ggr1Q=1NON~f8_2k5B<*EBvI z`1R8c&hWsH=O1&X2P)zxo$^iDPcS*5UP1s(DDR{dPv)EPWj-k>Deiw~?r5iW?YYj} zx#`ZdY13d}UBcN^8Nqi#8b@WzTc_+(Ol4)tB?^?_o;Cu*r-)O*{h#D4O|m+@EW`Fh z!Yq?`gWIYvlGc>}3;eDAGY#trX`3k$`L2WU6Rd;V`jjld`DDTg&R3;41KnIzvt5AN z*X#$l`+@F0So@l2hf@~!{_<2Hp{Nlbr@Y(VF~4a0UgaCJov&hu80*TWy=R-mHcEawUEYRY*V%DKrI!Bvzq9`={*5)qc|M=Jdf zgOZ%Vm9900IUl)<+~%0Sru;Q`YB@DI|5&9n-0Q1PH>$Yfmd^t<55M#e4o}G_?#|K9 z#T$ybZcf;c%mx34BJO=m?lkR?B^}P6UQzsu7^)$cZ_Fmc_JEIQC zjv0OE>GVQHlhM{t1+C(SlT|^@`MnBSEkr@h)qO4LTYY>N$-?)9{mi#tLCyQt->W<& z+vm|uhBXR$8L=%NGDg*&Yj(7#7ug6J?>E!CGWA56fa8?Nv+gvpG^H(*&XfAmc{qqtoli?xL)r&Nn@lD& zhUC=x9mRE{-$VJI=GSksgwb#F^iIC@8$Ut)zJYtbSjLXG1x)s$qPmEd!=W5LL@h+` z+ASYHnu^D%#|d-JwpXVH(wwxUL-yHhGpHA@Qof&~} zf3tOJsi}dQNee26_F_$JO}%VX$kxx9$F2N5qXZKf9A)1#GJT@qZvSX+&nVK?Q|2@` zl;$_hoj?gYCQ4Kj>PL{~_{lcSIlo`iJR|%t}Y zt2xb%A+;Unh6a@9YaHj1*7RzrtU5g4IA=WRINv^xn>QS1>^qLL>_f+yHQjOEn`uJW z;?^yuP<~R@YTxA9wq0ua&jHib(kI3l*uxM^a8%cCDw)Fh{h{YK?r74X{Ax0Nhcgqlt9D?VNFf>!F|R+qP;G17bi7h=KTFVB6L$ z|LxjEWw2)TzuLrr7!U(uAbuD~UG?|2ZClX)b!%5?69ZyE42XgFVSt)$`}VEU|M*F9 zSt<$F(3w{|H=bmKn#e1_+&u(AD`(itHppA z5ChWxT=Kwfo}tJIk1%KcC>RVL1CN6zz!1QLMUHtQZy0zAJPn3}XTS(B5{v?)!Lwit z7z>^Q;{XpOIpe_#;6?Bfm;hb|uYgyGDtHS_18)N!aB|)S z=FJxGgAV`?JUQlBZS#KDPr#?(Gr&Vo&I~XU%mO_4ECxSj_D ztOjeqTCfhR2Wemf*a$X(&A_a4Gix2Tf$d-i*a^(~=l=kY-k4W5IXw2`KI&zj94rEO zF^iK3iURX4h2o$DC<#h|(!e}nTo#lANx(dZRsmE5l|W@s1snpZfyZB!gG)doa49g)oL>&E09S&mz}28JXabsoYrwUj z8E6h#fa}2Z!2CYMjljI;;wEr2xCPt_ZUZerE6^IW0c}A$&>q|lI)IL#6X*=OfUck$ z=ni^-J3vp+3-ktkz@4Bk=m+|PyTIMx9&j(X58Mw@zyshxFaQh$4}pilAn*uy6buHB zfycoUUdN#JB~3OE&<22KYxKuu5!oB?WsGr?Kl zY;X>!1I`8K0rOiY=YtEtg}^+2ULQ074Z%g=VsHs)1TF>UEv}b?E5McDDsVMu44QzZ z;2Ll(Xa<^t7T`K?J-7kf2>u6d0yl$Oz^&jm&=Rx)tw9^m7PJHH!R?>}=m?g;9D>U%mwqncVIqP0KNwc!4F^&SPXsyKY=A+DOd)62ETyi;8*Y) z_#ONK{se!472t315BL|X1gk(QSPj;IwO}1s57NK}un}wmo52>a6>J0B!49w!>;nIR z-Cz$$2j-b?_cx08p8yg;QBVvN2PHsBPzsa=Wk6X_4kUr{paQ4}DuK$N3OEE*1&4ye 
zKs9hUI076Ajsi!6WN-{P7W^Nm4tS}Gb38Z!oCr<=CxcVKso*qlI;a6^f?D7VP#c^H z&H`tHb3h$%E;tX=1?Phcz=fb5s1F)|hTtM_F}MUY0+)izz~$fya3#13Tn!q7CcwNu z?iz3{Xa<^t7T`K?JuvT?yAk{k+yrh0w}4y0ZJ;G+1zLkPpe<+z+JoCc2hb680-Zq@ z&=qt8-9Zm<2j~fUf!?4GxD)gR{XlXWZ1LMH+U_5vMya-+b6Tr*h z74RyU2quBa;5G0%cmqrUZ-S}dEietd4c-Cog7?7t-~;d>_y~LqJ^`PC&%ksr1Iz@o zz-;h2_yT+hz5-u^Z@{-;4%kCqXXH}`17aZB8PND&w5OorF9yVb z7>JAk>3?Lr$(I-q17aZB8Ib-*dkQN4Vn7Utfyfw;{zt}}e2D=uAO@nH0qK9Vr=a35 z2E>3Eh>QX0e`LJLmlzNOVj$WXkp4$|3M&3$Kn#e1$QY3RN5-3ci2*Sn2BMt->3_7R zpyDqE#DExxi~;F?WW33j7!U(uAley_{zrQXD*j?X42Xfq7?A!)#+!VJ0Wly3qMZTh zf3&Bd;x7iofEb930qK8ayvdgs5CdW$+8L1kM|%n?{$fB3h=Ir$kp4%;n|z4@F(3w_ zodM~8w5OorF9yVb7>JAk>3?Lr$(I-q17aZB8Ib-*dkQN4Vn7Utfyfw;{zt}}e2D=u zAO@nH0qK9Vr=a352E>3Eh>QX0e`LJLmlzNOVj$WXkp4$|3M&3$Kn#e1$QY3RN5-3c zi2*Sn2BMt->3_7RpyDqE#DExxi~;F?WW33j7!U(uAley_{zrQXD*j?X42Xfq7?A!) z#+!VJ0Wly3qMZThf3&Bd;x7iofEb930qK8ayvdgs5CdW$+8L1kM|%n?{$fB3h=Ir$ zkp4%;n|z4@F(3w_odM~8w5OorF9yVb7>JAk>3?Lr$(I-q17aZB8Ib-*dkQN4Vn7Ut zfyfw;{zt}}e2D=uAO@nH0qK9Vr=a352E>3Eh>QX0e`LJLmlzNOVj$WXkp4$|3M&3$ zKn#e1$QY3RN5-3ci2*Sn2BMt->3_7RpyDqE#DExxi~;F?WW33j7!U(uAley_{zrQX zD*j?X42Xfq7?A!)#+!VJ0Wly3qMZThf3&Bd;x7iofEb930qK8ayvdgs5CdW$+8L1k zM|%n?{$fB3h=Ir$kp4%;n|z4@F(3w_odM~8w5OorF9yVb7>JAk>3?Lr$(I-q17aZB z8Ib-*dkQN4Vn7Utfyfw;{zt}}e2D=uAO@nH0qK9Vr=a352E>3Eh>QX0e`LJLmlzNO zVj$WXkp4$|3M&3$Kn#e1$QY3RN5-3ci2*Sn2BMt->3_7RpyDqE#DExxi~;F?WW33j z7!U(uAley_{zrQXD*j?X42Xfq7?A!)#+!VJ0Wly3qMZThf3&Bd;x7iofEb930qK8a zyvdgs5CdW$+8L1kM|%n?{$fB3h=Ir$kp4%;n|z4@F(3w_odM~8w5OorF9yVb7>JAk z>3?Lr$(I-q17aZB8Ib-*dkQN4Vn7Utfyfw;{zt}}e2D=uAO@nH0qK9Vr=a352E>3E zh>QX0e`LJLmlzNOVj$WXkp4$|3M&3$Kn#e1$QY3RN5-3ci2*Sn2BMt->3_7RpyDqE z#DExxi~;F?WW33j7!U(uAley_{zrQXD*j?X42Xfq7?A!)#+!VJ0Wly3qMZThf3&Bd z;x7iofEb930qK8ayvdgs5CdW$+8L1kM|%n?{$fB3h=Ir$kp4%;n|z4@F(3w_odM~8 zw5OorF9yVb7>JAk>3?Lr$(I-q17aZB87Nfxzjn<^_+-C2^&cW;zjB6S&hXWqvmoOSX+^Yq-h*6~x93Yw?I1yjXLL zgAHr9Y);#Mcf<>w_cXMXn2-+%n}>(BrG^XH1cep|fgdujn3 zFI}>D`7cX2z%eIh&HU`w~BRpMNa3)k-X-wQLasVn7V|87NfxPh@DNfArD&{rdH7 
ze|!7aUz@aR=hpAPpLfR{cZ?Y`dfankx^(Hhlnv9_-ew>(HS?)~sGhBcD2j zKpubmG0s2q&_En>a@eq;oP6r3p?BTY|GV$zaE2)P>C&+n5CdW$W*8_``VXt_cuOj> z|LwP5-g3(=eEsC(4}0|JzWcxJH1a!l>a=9Zq8Zaax##Y?Y4C5~w&}?sPfVQn%95Xc z;L_$zY3b?dwA=sv=P%mxrZ(BKf$HGxX;ahF|HC~ZkGwzj*x>Z^-Gd%}Xw=9NR2(~Z zY{AW2Q>T3O@aWcH}nW5On7n7pogCwJ*r*1c1xEoo{s+CbGIGsA2Q?#x(OeC_#Tb_k3at4jn^l? z^Y&XmESyheVfCMu{OhkzV%P^gr#rx45!K4?zx{G&pFRx!;EX!rWu$Pp7bhB_xA1EfBn_x-_84W$dJbw?V(kly6SJV{jUD~ro8bQ zA=sH;?7jW=+Zpp8@ZbYe-+Y690o#x#9)I!0@o0Xx?%n2m`_*^zzIh-eWz4gqcJ17P zuNX^lnIi_ofEch06e|6{;yhz{&zsnY3D!j#N#yVUw-+eJ9_kBik2q&O*h>% zY2qsk?6hgqX3m_i81td&K6&!1xWDhdduiCy)JM9l{^NK1wvAK=gv`E8o7PP7e*5iL z)GzP7`}V|FU+&zw)AHp@nO|mf=-qdwQIF8SSor;S8vj=@5d&f%h8ZZd`k%V;?@paM z(f()XhVh*7&p-F<=#i}UVa$KQ{JBg4%$_x!iCPx?P(d(IgL+dzj2tl>^|lL#FwN-y zUHzxM^=A5>ozxLD^_g65*RCzK02Kt41z|ovZtTy?ej*T74gLPx&*R6B8#QVKx1gdS zgcweGnI;CrK%6sBX!W1=f9uw*8I3{Ak?tMaH}BlBnFAt>P;(LiXOM@{AzJme%RtqD zGb{citJVKLeR_`^`3w~YYR^nTtClUNfA$IW0#yLbd&1ngJtJ7$Q;!e=8;ef)N+5(B z=Q%{?ivckZs|=uWg=hW0CE1F3>Z*Spd1TO{MGH`AMsnhh2;69N08!1(0>VZx!~zz7c=gv}exGi5f#_a)?b&NH%I42Xf)V4x7{zm;Y7 zc7x~0gxU)S8#!igdkxN7>JC4La6_d86sa|Kn#chF%T95(*LmNl4CI- z2E;(jFd+SpnWU5QF0qK9tB&7@y17bi7gvEgLKP3_^5r3?`R zVn7Up#enobEV|@a42S_S5Hk!&|6?X8Wr!FM17aX72BiOC(Iv-XKn#e1m|;NrA2UfQ zL&Sg>5CdT`ApH-EE;$whVn7VU33>*s$*~v^17aX%7?A$Q zOj61aF(3xSKv)b&|HGn7j>Ui&5Cbv8fb>6Rl2V3<0Wly3!eT)B9~ND5EC$4Y7>F4L zr2jFKlrlsNhygJW76a1%u;`LwF(3xSK+G^8{g0WXlp$h342XfS7?A#lMVB0l0Wly3 zVuk_ff6OGM3=sojKn#S%fb>5sy5v|4hygJWGYm-oVjCC6ew z42Xf4VL3E2#W#fe^_+M zu^12oVjyN1kp9O^Qpyl9AO^%hSPV%2!=g)$#ef(P12MyZ^gm{jQig~DF(3xQVnF&I z7F}{I2E>3Eh#3Z?|1p!4GDHlB0WlC31JeJn=#pbGAO^%h%rGGRkC~*DA!0xbh=H&e zkp727mmG@$F(3wFh5_k+%p|1@5d&gC41~pi^gk@R8muNhw3bfEW-1VKE^64~s5276W2H48#lr(*KxAN*N*s z#DEwGivj6>Saiv;7!U(uAZ8el{>MyG$`COi2E;&E3`qaOqDzj&fEW-1F~flLKW36r zhKK<%AO^x>K>8mRU2-f2#DExx83v^PF_V-sL=1=lF%T95(*LmN%H8qWH7jwzes$_U zY^kgMX1{jrst+Do~6*Htkf2E@RDWI*~~uxz9A>(`}j-?nkbcD7C1wr(JNiGRV8 ztkA@O7$^`1r2hp%qYcE0|G)qI`N_v0uzmF5d$VSKX2oAsKmqcspu~U}C=dpu{{=#$ 
z4FobbZ(6_b`|m1Ou2P{w#gZjU)u>T}lU$Yl7a-3HN(_jB0%1VvK>*$#c{|b?`y%5nT!5&MXJ;w*;AZwJkuRkW9g9wcBhm%ipyxVcWmJ8cnJoDo+GZ211Wgk18z zdLejc{C>TYo--YYf&I+@IdJ>-t&ZdD?;C{`zE>8A-rj|jdskQfo3XSqc!t0|FSuvp z4C`>%r@CM5RT~;7Ibh?!9$QUj?*+nnzZdJ#yUIbfS8$STWpmorEgLp& zSep@`8v!p^dsAH6xN)8+`M+8RT(u_&a6{S}_TCwPP&m$g^BjBTd-wHw%*7d(>{S~9 z|FK7=jkCRlcTD`fU_(w?qewf?DC@ww=Q!KV3k_eEG+zWv6_C2hzMFuXr5ntM;u_L^~)uHMzw+1J8CZm_8$Xe#X_b3 zHs`aabf6Nb!H(^lckSG|ZR^Ig^{KYFp`x7GzHO6r204CT2<*-sTd?cz6~A+yWBjgO zZHg*o4PRVBf!VmqhGOFYU+gK5HYiIt$D1~;^R5z^T&@y0A!pJ5b5^Zf!M%tE@tO1O zS6_TSbIDIXU;@EX1EReH*8#m8I0m~HbFfy zIZ3oJ8KXG)*Pp*))IWd!Vb9<`ccW|^xR(uwIFQ6Pdu&KsP26z6tvLg+IHMx4)>)4@ zfJvgqtqH-pv{{RwJZ3m=-FtgH+MGu8h%jk^;k$Nh!2y@twA;)v2O+F-b=}%j?#SBU zfB*UG_uqc`^N;03$!7A^sTnO3dm|fF?Zv=;Vt_nfc=X>ZTviY$5tOVqUY~T=UHz`R zuEj+cH9Y_Py7lYTYudEQoqc*woA&00^{Xj^Zhz;8&t$G(; z(6Qs~tiz-rQ-pu{`KPvRS`)~H7hW*!ttp&DUV8Pq(;^wnYKcdWXo_ldqRN~1epSbwqi<&oYcF83dPo4S(0U{>;@&B{V zJd^wW@%t~mdiFT|^wXIXKJ?H-k3II-CXKKD@Wc106bQ$&nw#Q%%$QMEUwzdD7o1PX zx88couwg@f`FRQPF1@r7i;GS=>BO2fYhH21ZhL?3ccrFa6aB!wY+WQh~elS+Gx`bqrU#?^FDog zUDUAQ>8G8BQMGHI(X>h9Ax}K^+po*AvgAe2qmFyImEg9&SN!qL+f!3g?)%^WZoK&7 zi`}%okSKNU-gVNXS5i~|Ce`ds353X#I!k_9)T2i?H}V(YlH<4DdV}+G=6rS4RacTR zNc!HrdyzEucKG6eSk0XAX@?H&&po#e5higPH@=#*!(OrtwOG%dJz#^YFTVJ^cT+b4 z8)*_e`|QXTEt;Qs=9wp*bkeD(o?5Tog%J4hNAE*5X>DVeX9{{e=s*la#sHbQ@aR7# zkmo5u=-9X4eA%T-XG%x8a^*^uDpj^@*$NdZFr!+rVnutVT)Cu%4I9#&wuQ?WvTf5- zPd&-POWd>e2FGQ}lr34ZBwpIIX-$cyUSOo_kV6g$1Og}$4NgR(e*JnyiX@aTU%q(p z;wZdVRINOs?KmccgoK32lVA0;2*KI?-*$>W0dgbuXd3Na3a_XCM3{0t|GPQVL_~@E zGAT?IP_=4R%;Ap2feOnOc6+sI%@vrm;ORZ5pG%~gW1F0l(UPdMR(mtGtXOICHWT4c|_{NhCm`uFQg zok;k2bTdqOn_&pBRH@Qs%9J_#?6avIY!c%UNvBCrvu2HA#fp`$P~H$)u~M;O#ZNx@ zWHQF|&ps|uq6DE4d1?m?!Hc~eS80>rnqcvA=%FNORbtM4sYYS!uKxWX%I38D?!B9= z!bu3oZgc-FxrdX=Tfm z!y{KIj2xVN@=3=Wa||U5g{OIT=9#q#gwRTqDEZW|C;!_;V{0w#QXJ!@N|i%s(b2M9 z?G`->ER8W*;7Dw8ax%A`GUYYK5I9df+2ftIVwtjK-CL9!J9adkdQblkJM1v(pxU)- z)7NkdyD8?>SyV~HAER(krgWJ}6JJK$Z1a@{yzL-h8g96+D`FMC{9+bO%B@?{diUyy 
zQel_19Ybo>s&(9P#}OnH^l_1E$)H|1k$!`YwOqV5ld0~mtGog zd4S>0ojTfj&|YfSt}Qy5oP0EqKkTq-Q~_119tx#zO?@LhefL*i&V~o#7BngeoyT^b zl0fC?0?<$|TC^B`aYKSqm0*+`pK_c*S7Gd#;ltpT+xa8Lu~lR8%p-#yf(( zBRD2NFT-#SZ)JAtA9zlg8=kyU>5mxOu7K z)mL3<^`G)W4(ZXoYtf=bi5^!?I+iI@+Eg!PxFhWidJT;(xr7Wuwt<;@?zwBv?(N); zo5Bfhh{MrGn=DENLC1miKhefLxq@d#8Z@ZSm>xOPRskU<>#G(@Q+JPI9R#U;uM+EuUKh3Ei{x)#lwp%q+3X(_@CDKN;uXvxTt z!)w&2fjAs?*r8Tn37N`*DuagH6_;Pmy$GM0B}4yHP16gtLH}>L)HmPg*s%kJ9bvI|WVnN&V{S+M-+p^4 znqZX>Ww`048wsDmuAi6vl)h)@m}f^KCio&uuHJwDJ+yfLS@8!~pC32olv7T@YMR$H zJ87$X@ysQwSp6UI%+t_|2~gLjb*r58pFW7?0ZSQDv-(fq2zbK=4Tv@obB#Lu(ngoS zHnr07UzQT>mnXbH8rf(Qa|VApb?U%q9#$j2_uY37;t#!8L~0Iw@` znB-&`SjC|&}w2%3ereG|6vTn zC%p6mD$RL5gvP$Z?d_g^dMKl34I9)a6-ZxFpZbhFAyeIu(`Xn_zf-SOs#FPwMeW+Pu~JES#~HHjedOS^ji=|6Gfj>jHz3@u6GfZCw09MDrhH*EcdYK<5% z9Jyv@;mtQ+#}*U=(-{2Uv5m>!Z3rw@pc1qaVS}sGeZz+jqiKz#z!(nF(|0nnjDZYa zKl;eSUwt*3OI90m$r}Zt|JFHz%!lbU5G5r(2{*#ehO8-6YymQN!H}(=BH66_iDl)oA7*2ppJ#f{%(GlE;QGHUJ ztc%r*7LxaRteT+N zcJW0QF_}v-Vf>j99%Rrupw#e=ol{Reh2rTJ=Q7Vc^Asm*)vCoAq_tkXdYd<;A@Ai@0?2%{Ngapc}YHTL=U(;S&0G*`=56+G$3vHm0pXG&q2+ZCf|ta>w>9 zG+cM?+-kWZSHM_Kc?0*5u#fG30mxEu6F}YGVGvIM>pn=s0xg+zG9= zMxmM1!lZ@U^(fQ2bt_`N-kkxrMxpkksT;}dT*4zd8Dfsj59DF4GM#$H8E4R~!#j2{ zT*l0TwVgBMBKiVU5O4^Sef#!7kdf;fZ@ht4H@S#Q^uKUMp2=4K;d%SkjURsSE-sOG z%%yTSHT#|&u01NYC%(D3H!0}tHK*etCtH0SZh9;NgVRNnf3bA;nRj5g3B zMUjZJl_{HQd=1Ko89Ja1Nt2J3DpG0N|3?4Im19&K^}}RH;1bG-X|7b3WzvD+3mVUh z7A^SUhxu&gVBvR+nK1SI{epRHfBeDtMy9o_S@Vo5tp%z7sNy%@d_n49quo6H$0!=u z-oO!8sokz=+JwH0vA1m5!H*7Nu+AE_=!XSt)-;S-xbQn;)c5lk6%R4WDkOW<^2Z++ z{`tqRD8D7mpJA-JGOixf|1#aWbzXR|=yo&KyvRd80CAkOdO7pKUV?2}M{K zAVT38y+`>`RAwQW&g<5-^DD2s^wwKb=Y>qpsx2I|p&m?aP$^(6)e5x$ElC!U^6?)~wQ-4)Fmvy{ccX;d+LI>Y5VGwL{bv}8 zDwQ}}0z>q_Q^yY8G%gWiaVDKLY7LAc@_t)_ru`olS)*|I?C6o!D8C_IhS|7b4YdPC ztyyiJrDEeNsY!-`2d2fz81AqP8K88^{r3`+koG@WC8+;qk~gdVV-Bl2ZADBV4Aj`b ziH}WOZp4|9BcGwOL$q0xg?rlw8Red|DahU$%ieyhz03~ai#-{GsV+-nE-C2eLC0bs z0tU!qg-ZW1!7I_kjp9tZi*iRZp4J_U_G$exyibeK^rEX(LptbFvjmY&G-dY9DX&+r 
zUY$`^I^ZmoDZx>xQp_GQr-m1bKSeJm{inIka1-rP?n@jf%LJIA|L)*6?Q<$1E2?fu z-_||n`CsCO;GBK-Sv+v$wOR=lb)vCKvz8)HF(>jYab(gnME`Hftp6-S#UkR*hXX8P zwJHXBd(JTLNq`W-NroM$;9xHo^*D(7Z%1-G{f8?A(;7wO`B>99LtW+>#ZpykR7gG# z$uP7NXmYbLNXVlfclGbfz+l-jWnqRJkpVb<_no&$T&gPn_z$^@Y(~n`0LV%I@95Fp z(|;Cu`YTzRXVCvqqcZFN4$}$s^q&+Yr}_^iV|zHX<|{nFa$5?^UuqEd}!c+ z%P+sIW{nz1D^C!db=H{-FSEXaHYBZUO4;LC+W+(mXnVi^-aE9q5k|NrG#cKt!)bWi zF^ZR7e4f(nT}5G--Lq=Io@amj>wl0MZg=jvbqLB0+bdG!FTo5|m#SPP3<&aE6v}w{Bh5uQ%`c z!6Tb7XWJ&0(@=Jp_GQxTx8=)FS4M{h4tS8|@svvBk;VVJng4ZPJ7DL3t-`a~pQYMs zR%cAqZr-$xVn|`MwZj!xT#jOJFN!=Rof|PMLmco+WT>R55va55^IxRm{!M16;FvFgT3E0=7|W#@k_Mc()im(WRO@o?IC3BXVibu>oJm|F0H0JqtlDB; zfIP!}t^Q*O52`aqgL`rX`hU(j=WN}wiJV5-S$pw?`I9ETN`4?$KuFiFoo(mBvs4FS zpuia@Z2FIZL7yjFP!K&t z|Lqy}NCXmUhfpYkDE4cwO{6SSV9^tA{SRw8D7H-Sxr)j{%{>eUQDFI4al>Q3zJ1*K zg=IFo7tEhSQAG!c9+J!KfW5EPe|w&#fYczrE?>&<4xJRFfQ{K_t7$Ap#~fD36J}2O z4;$FBbjc!yh3((~H~ntjx4bg}W9^2L(4k-|0g4bk9yjKEoT9lkQkxa48xukv*Z^@rnA_jNn@ViW+52^L`1;GA^-Z%-LY%=FH4R;{y4kv zjJWY!7-sI-y^Uc*Qr14BS1u_Dqext<|70iATG+br`s-U*d1r0&(4kLYAW2~DC2+#2 zTlYLN2DKF-vyjOSM%fC^Z?BHUKtVA;DJWFq|F(cr92gwnaVN??MG4)bp-5Ri`>eBW zZ{Lo`op_W6&19q0U;=Ddwc({eIKKb>dzUT!k%pPoTcn;xqA0V-I9Cx;#)!}^Jo@z? 
z#YQ?PfH&N59jn%d4sU#*8_H5h#5MoVxAmx4@l|uZ9;W_7= zJ!{rxJeY}8_Pw(=%X^p+qj^agwCyFzBV`2h zZAqbB$754;50G4zRa|#ni#~mNvC&9F#?b`ik+qLX|`Q`_0@g*_F-OA0dR~Y}OzhGo ztYh&xaj=9~k2z(J$o0R&?ewRZUA^$a3rQdtK+iCf5AwzQJ88!nNNO|`J6rv?(W5!d z7~Og2oyWKwQct}`P*+}g1@-T_=kkn|sU-wfq@|ui@%HUdUsFN!3pVAY%|=+b1On$aN$xiB2s*1i0$y zKh+i+?U12EhhQvj;0^cZoP9Qv5#&$I$6hKga@r}UoJ<93BkZ?S$6}zs86Zm+D*d;` z*}qTAde{0O*j3Ia)e(>C54gF#0OzPMkt{S z4$$7iz0qoSzJLxp<1>_H`WFoEvkr&3IgagE3WXF2vH`yT-rLj~jN3B~ZX<+qn}@IQ z2-`f}Kx8n=+J$LUOhk-Ya^y(!P&3x$r2oVUtD%eeeFn_okYutwER5n938faURIw6W zCz8pgS5{kK1{wc+_6*z;4HAV(aCbfMZqCC7cd;3U14fFxaID81+W%(h*Jq!7WO*jB zsA`ESehGvTB&v9-00t+qmpj^wYAXU3DUs-ygP8mEAHQpu`rp2ZWxDwi8(?@L%qgdw zeEsz;=#2C7nfvd%m-jYPNGX*_3N-?YHwc+q&|et$+_Ur)XyLOA2|cV;v*!5c$08Rt zCUi`e{{#0{}!f6_@OU3Xn`=6umx-Yt(yYaM&Lc@(+qu~5ovynLSLg@-@=6k6oA zJj=Ff)pF|=-cC$?l;MWiXJUwwQzM^n!tuoXxZ|ob2|`nrfosbHM%jisF`*I8d_A-< z(naGxr}n=WJ>o;_edETpL?N&mPl@Qsi&dTumAR@_{AKiZAzblH_VOyF8Xg{%bvSfmu_fFzy4}AV`992h9)#;5L>R|#&ges zw-GU-1beu&d&oqrcw>)ktkRNi->xmQ-slf*JZHoczl6+?J}j|^9D561p3F!EKea$L zjrdbze)!?L{A9zAi@#@(gqUDvw(Wl#E6bIQGpte!jg4q z_nt-efV-GFcgbQT6!9kxw8`In=PgXL8HRy$#(yy9o3B5=v{56bb=;M)rVGi*Z1vyX z2nU|Yq_Q>Hi!*ru>-f0=_@6oBGaBtwWDtuXA?yFiuGr|AfH_!-U&!RS6><@k7f=40 zjJIM^fsVrcLegS270g24MmM zbtS`4^^YDk0@G+qFoZ>a4dJ!xKXcN58!^Ahq?+Z{i!(PRA*>s60uDGsHG$f5J0^NU z+W(d@>m9qSU-%>rM4qY&2b?4j?-m?r69fB|0rFd+(tnJlbRdQ=Pk51WS31{j6;R2( z{eUt>f0_0wuTo)%;Fx2QsRG=W!)Gjav4xa9cG*(q?fnJNQtEe%d%pv|+7MV$&Y1~TLe&DhFo|k(p@>NstCP}8-iL?inwmf(QdPbz|1wQ4dRPHDxE z{m0V&4jJ9Zv;Ds>aey53A3@`N%MeJFnx7BiNvB}Mh^Ntimb~()b(JcW`4zA>ZCcT* zBF;8S=s)u;cKj#*@Bi5sbNqfYGK@>$3**Pp_(%Dv&O$N_nTPHqODmbhV0L)3TS?`l z|CmMvM}S0|%FZ!m0n1G9rc~zCCR!ZBE*# zltdQZxcgFO&B*h$$^7>w*U8qJx`Ol9kF8Oh{f|% zMnoPPZH8e_zkoHmymEo*37io_ww(0eUSj@;j7L2}?dJXdm-K&M5=(vy8v|tZLe>6< zA6vxOQ=;hwFrL67dD?9JvPS>@efe!B-sed>lRT+O>Jgl-E!y z+OCYcTz~y_*iP!?E;#uNpFp5%`SPW__=1h!9nQ}-{OMu5D3$SZCf`|~L&brvGK9_} zcs~7KNm|%ROy)5eS|>$poH2@-bQVWp4(nc7e2YhF1kUF(+WBE^I8&(3`Z 
z3`ilv@Z21khs8XE#^Ai2E8*DW{B0Zg)c~T$NEeTd@T)6K7!f@-s)p&GeN30ZTq<3f zXVdcA{{L?~kDIXqmfk8Ihwk0Gy3@@WQ#1LrNY2E-fnlH!>OY3t63w0xO$-^{K{qJ8 z-kvk&YH+^H^KxKFEAYEu3)7uEIZN{ur9w`53A`0jOU1t5(cZ#)yN*4`eGlO{p#CK$ z{GJ+dqb^|2t89oZ0oskUKrOBCdg5O zte_eqdU&xJ#&b+;d0U8$9(KXMXBU@P!bG$&nR<=22i7#}H{|Z#SFREQD|-nOkL+0r z<%Q`v(}5T$Bn%Wz{f8W{ka9qwrsT4D2fU(<^PZbRLgv0i16vRZE2QjgYja-$^!xRQ zfoBc=qhFLhW5b@V!gbJ%FDNUHm8_#YkMiDIOOEfyIoEmbzTe{6b7XK2ZeUsR1Am#F>ufs zkp3U^Bv1szfEW-12ZaIY|3RT$Hx&b7Knxso2BiN7JqZ*6F(3xSz(HX^`hQSp*Gp{Akp3SO+I3SgAO^(1L1#evf6$Xa5fB4nKnxrd2BiN7 zg?8Ok42S_SaL^f${vY%tPz1z)7!U&og#qdRL7`nY6$4^G3>e+KrS{v-bD*REc* z^0!%E51szy;2B>%9@aMfOXHsXXJ0)2>E~v@KQ=<1`Q^i--v09bSJw}i^zVR4tHRpu zdu7$A_g1YpHJ#b=PrF=-0Wly34mtz-SO3?o`RCtdQ$DzT^1HXa_Fk*7w#n}r_mkdj z_42e<6W%iW{jp7WtM%k}Zs;-U{i_CTY&Ll1wS!ahzcm}Y>dJ>w?|5$2`gNwKBmF<< znOPAK17aX72KJx+6OD}VpG`X$x7p(E_rn0rJ`M_*@t^-R{$s|kw21*R5MK=JKmB*b z-&6sGdj8i;N&lU?`rlQnSEa6AnQHc7Z2X`&?$8yLdAd>5Cc)f!2Z+!C?Y{o z6$4^G48$P=(*HP2beSv$#DExp0qMW8ffx`2Vjw;lkp9PKy31-YAO^&M^j~>E42S_S z5T6W4|Kl^=WwjU(17bk>uRI_I#DExxPX?s_@tN+jS`3H*F(Ca{9uNa!Kn%nu1JeKa zOm|r=2E>3Ekp3$VhygJm2I7+e>3@8tyQ~%iVn7T?|CI;CfEW-1@yUSnKR(l4R*L~K zAO@uW$^&9R42XgFWI*~KpXn~E#ef(P1JZxx0Wly3#6Wy9ApMWebeGj)Kn#ch>A&)T z7!U(uAU+w8{>Nv!%W5$o2E>5$UwJ?bhygJWpA1O<<1^i5wHOcsVnF(@JRk8n_=`O3qfEW-1 z(tqUvF(3xSKzuSF{g2Odm(^lG42S{gzw&?>5CdW$J{gey$7j0BYB3-N#DJ&&9B2~* zVn7Utf%st{b=BY7wrz17XX}K z76W2H48#uu(*O8LYgsA=#DEy^Ga&ukDs)brD8w~hygzX(tkf2 zIu-+BKn%nW1JeKaNo!du2E>3E@G~I&_p_m6F(3xSK>RQu{g0otmZf4q42S_g1JZv# z8#)#PVn7VU4+GNw_(^M7Dh9-W81OS7{r9t>V=*8G#6bKoApMV@w3ek}Kn#chKLgT# zKN~t017bi7#18|~|M*F3StK76W2H48#uu(*O8LYgsA=#DEy^Ga&ukDs)brD8w~hygzX zA^Pv=lZAn6o$mbgc9NY-oNoN@>GaRyCbtuZI7OWxezSj+>GM@_wC}e^IZw= zuoQO>+MnP18=fj>+~y*u51jWT28R7)r-jpti2amHb)&ZSh$$uHpHoG1}{O^S^uBN{flwBILNR4 zhOeVOw_z1^nz{Gs>a>D^{?6sZpgni*$-P^_OGl2oS*O|V&Hcl*|8_!J^O1?QNw-j2 S!;4Ry2<~sn$`K|L^7(%)V&Z84 literal 0 HcmV?d00001 diff --git a/integrations/libreoffice/tests/test_files/xls/basic_tables_two_sheets.xls 
b/integrations/libreoffice/tests/test_files/xls/basic_tables_two_sheets.xls new file mode 100644 index 0000000000000000000000000000000000000000..9d6336968fbaaefac835d398015c11c57672036b GIT binary patch literal 46080 zcmeI)1y~l_y1?=IC_z!NTQM=PJ3$n?yA===MNq&l?C$Qu?rsIUyA!(=JGRPs*L>iH z*?;$O?|Yv6+D1*)sO^;#boDyEOmg|9cJmySh^AW7of( zt(?U+*4oa5C)6pgpZj+-3vHcMxO{5bU5OT2XLW6;;Nj=x_`fx|>1FURsWOC_0xIVxAh*8SbDxvCRN|1(#6rS$*39x8R}sJ>qG6jRT6%=Hjy^jtr2 z=bw1)pST^mwX_7iJnB}gPv-LcaCJ+L_DL3Uj)rJaYVHtXsn3&y*y!`5Ay)c4Sx7v6 zo;<`ls*Qdo43Rr`B6ZW3JG`4mkgu0x0}oF>?;&!-{-o|_vRVhnn!Z6nzW(hT8+rJ3 z^&TzP184aOH`z+aoo|dud!~7+UO{9twUuTQ>8NH>Zq>6;?MsDfuPS1e3u_4?RqtW1 zB(&8An54I18)kBqPggDDYq>J1{)Vl5Nn7bp*nX5Q$5*-YtKvq7SAbt@4_Q`oPx=1A z-a*09zj{T_+eS$T##Y-UYgKJ|*`|pdO7B+p+lUjp}my0t$Z_e^+=+Z z+(CAiw?g`7cZ;QUX{~#RNX(R7>ffX}n+C|5aI4(<7J2l~fqL5Hl2BzjNVmcIXAar^ z)k`%!a&d~aoKu$1JM}&1)c2K3u2{)hI@LEd?oS;>qp7~D(aa}K-2JJ(tudXsuHvRM z*JE5W*I-;zePiSPRBJby>N^`vwGyM5n`+!VRA1YePW8QwmKsd;&5h|)U)^Y`?`|~J zmp7W~+Z#>w^^K{kX^h+E22*1mV>)LrH6}8qQ)48fY9idvtNzyc*LP)2T6@(bSmFXle{-G&LqPni?Y- zO^q3i<_cCG%nhspSVb^*Ff~RsmZdUS6)-iXHEvrCOpSSs>1u$fF|je78Y3I6Hdq}n zHKsOhtH#(yQ)6zU>3tT%y`v$dYXsI9tO;0CFmtabZaZxb=~{rb1ZxG>8mtYN2bd?A z7ntf(80*v<%m=I;SbH#Eunu4y!Ti8Ff%$_4fOQ7z0u~4s1Qra|6|5Urcd#B{<|j09 z+gdM3*Bh)4SYNPyVEw^Dzy^Q~1RDf47;FgGP_SWO!@)*?jRYG7HX3XU*jTV}VB^6i zfK3FO1U4CL3fNSzX<*aAW`NBEn*}x-Y!295uz6tf!4`lm1X~0a3bq(*3D{DwWnjy} zR)DPpTLrcnYz^32uytVT!8U+x1lt6*8EgyKRRI|X(c><-vnuzO(l!5)A;1bYPb80-nyQ?O@X&%s`Ry##v&776wm>R>g%YJ$}Qs|{8M ztS(qRu=-#Pz#4)z0&5J`1gt4oGqC1hEx=lWwE}Am)&|T2%oEHDtSy)~m=9Pxu=ZfS zU>(3Zg86}U0`mt80P76a1uPIO2rL+^D_A$M?qEH@dV=)=>kZZitS?wUu>N2nU<1Gg zf(-&23^oL8DA+Kt;b0@cMuLq38x1xFY%JI~u<>9Mz$SuC0-Fpr1#BwVG_dJlGr(qo z%>tVZHV14j*gUZLU<<$&f-M3I1zQZZ1Z*kTGO*=fE5KHQtpZyOwgzl1*gCNFU>m?T zf^7oZ47LSqE7&%$?O;2=c7p8!+YPn{Y%f?CSUA`|u>D{Mzz%{P0y_+L1nel-F|gxc zC%{gEodP=zb_VP$*g3HCU>CqHf?Wc;40Z+VD%dr!>tHv)Zi3wcyA2irb_eV(*gdfO zU=P3^f;|Fz4E6-iv)WO_6F=N*gLTIU?0Fff_(z}4E6==H?Xf@ z-@v|uMS#7|a1I 
z30P9FWMIj`Qh=odO9hr1EDcy%uykPQ!7_kl1j_{G2$mTv3s_dLY+%{Ja)3F3{pVRvN4fSXr=gU@l;;VCBKw zz$$=M1ak+g1XdZW3RqRJYGBpDYJk-Qs|8jYtPWURuzFzi!5V-y1ZxD=7_13cQ?O=W z>VGU7{~fFaSWB>0V6DO0fO&vju^xtOr<6uwG!j!TNyp1?vaaA1nlH0N6mVL12TyhJXzP8wNHUYy{Xy zuu)*6!N!1%1sex89&7^GM6gLAM60wL9jz$hry139R)iEb{y;k*h#QcV5h;(fSm<92X-Fp z0@y{cOJJA5u7F(yy9Rb0>;~9Puv=ia!6Lx!fZYYV2X-Iq0oX&ZM_`Y^o`5|Cdj|F# z>;>3MuvcJ_V6Va6fV~BK2lgK91K3BfPhg+HzJUD(_7&_K*mtlfF#XTF4bS>amVX@g zSb$lAS%F!D*?`3ZvjwvQiw~9n%pNQuSR$~*U=CnOz>u*_gtz_Nm61IrGU1I!65Cs;19++caYoWb&fi1(qLu4%7T>xa{+S&D-Y%dRspOcm^)Y{u*zUnz^Z~(1FH^J z1FR-kEwI{Pb-?O^)dQ;!)&Q&_SR=5;U`@c9f;9tc4%Py!C0Hx4)?jVGJit7`yujLm zd4u_YwF7Gp<_p#VtRt8oSSK)lumG^mU|qlh!Ggeo!McKV1M3df1FR=lFRn*bpCD>}7Du*+aqz^;N_1G^4(1MDW)EwI~Q z5ny+~?t>=1AgFTlwL|3YS^-XClkSwMbItXW0o z)ZH%9LFUHELW{3&r(Qs6Ei21eB-OUqs8p^JW%PI8skaKt{uD-MAoF(e%D{Fi$LL93 z->2S^s(y9 z!ODP@1uF;U0_F-<9?T7_0$4>bcd$xemBFfjRRyaCRvoMcSWU26V70;OfYk-72UZ`f z0a!z@MqrJ>nt(M0YX;UFtOZz0uvTEL!Pfwcwm2J-=H2i6|U7pwzVM=(FI zPGJ6E0brfMx_||O1%U;Fbp`7N)*Y+|SWmECV7;%|Juv1{C z!Ono41v>|J9_#|xMX*a?m%*-pT?M-ab{*^n*iEooV7I{{!0v$E1-l1!AM63xL$F6+ zkHMaRJq3FP_8ja5*h{ciV3A<2!QOzq1$zhf9_$0yN3c&|pTWL>{RZ|G>>Jp3uqZI| zquaR8y-X&RFs@mES%O)CS%cYt#RIbivjd9{mH^BiEFoATu*6^vU`fD|f+Yh>4weEe zC0Hu3)L?1A(t@P}OAnR-EF)MZFh{V=U|GPjf@K5C4weJV2`ndAF0kBSdBB{(@`B|9 z%MVrntRPq+u)<(Pz>0zu11kXCHu!>;r zV3oisgH-{m3RVrQI#>;`nqalSYJ=4Qs|!{StUg!+u!dlbz#4-!0c#4@46Hd=3$T`8 zt-xA?wE^=0^91t(YYXNL<^$FatUZ`7SO>6kHNotUp)?*Z{DBV1vL0gAD;23N{REIM@iVkzk|1MuUw38w)lL zY&_Tmu!&%kz$SxD0hkPWHV%C5G)Z`VlW4=Bw$Iwl7S@$O97S=EEQO4ury$4!P0@H2g?AK5iAp!BUomz zEMQr|vVmm>%K_#DmJ=)&SZ=U9V9sE9!SaFS2P*(p5UdbbVXz`#MZt=J6$dK;RuZff zSZS~_U}eF|fw_RWf|Unz1FHa55zHN|5?E!hDqvN?s)1Dps{vLMtQJ^pusUFM!Rmq4 z2WtS<5UdecW3VP*O~IOhH3w?}))K50SZlC0U>;zeU|wKt!MwqIz}kVe2lEB%0M-%A z53CcIKUe@*XRt0{fnY&k!C+m%x`A~E>jBmitQS~sus&dY!TN#q2MYlk05%Y85ZGX_ zAz(wnhJg(S8v!;FY!ujNurXj`!N!4&2b%yk5o{9JWUwh=xK=jrf*lVyiU~j?RfxQR&0QM2=6WC|4FJQlceFggl_8lw= z%=`z|rS&I+F@O7Ovi##SK?^WTFe@-?FdMLVV76d(VDZ5cfZ2m31WN>#7|a1I30P9F 
zWMIj`Qh=odO9hr1EDcy%uykPQ!7_kl1j_{G2$mTv3s_dLY+%{Ja)3F3{pVRvN4fSXr=gU@l;;VCBKwz$$=M z1ak+g1XdZW3RqRJYGBpDYJk-Qs|8jYtPWURuzFzi!5V-y1ZxD=7_13cQ?O=W&B0oL zwFGMg)*7r0mjl;utPfaUuzq0u!9u_WfDHs21U49K2-r}tVPM0-Mu3e38wEBRYz){~ zuyJ7H!6txB1e*jl8EgvJRIq7a)4^td%>2LZ|kC!x8iNF7*KedN{rk~Y-dYt!N-ixThN|sg4tL1pnb6Nan`Y)~llN@c!`08zY zQBe-I`eR=$Opd)wmD0&aO<%9TfS>@MU`O{JUfzC=C5z{6;Nj`*=jSai?9((iUmu?m zD)HZ)r2UyQ{hK@ZE`O?Sf?cR~9Wi9yo6oh|JFJ{wtL1Upc(s679|=`(wO$q6PQq0} zsz9qDkwBt}guO&tiG&iJB~&4w$e+(=j9ymza+QdgTw@kw{n-rFm8Znt`2Onr501|NKQa5O^{?))|A~BK_VGW-|6d_Lm~#LC literal 0 HcmV?d00001 From a02cc9c019ff6fd57ffedea7a4a3e785e07ca377 Mon Sep 17 00:00:00 2001 From: Max Swain <89113255+maxdswain@users.noreply.github.com> Date: Tue, 24 Mar 2026 10:10:41 +0000 Subject: [PATCH 02/10] Address review comments --- .github/workflows/libreoffice.yml | 8 +- .../libreoffice/pydoc/config_docusaurus.yml | 4 +- integrations/libreoffice/pyproject.toml | 14 ++- .../converters/libreoffice/converter.py | 98 ++++++++++++++----- 4 files changed, 89 insertions(+), 35 deletions(-) diff --git a/.github/workflows/libreoffice.yml b/.github/workflows/libreoffice.yml index 869d00ea79..18c114c7f8 100644 --- a/.github/workflows/libreoffice.yml +++ b/.github/workflows/libreoffice.yml @@ -31,7 +31,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] - python-version: ["3.10", "3.13"] + python-version: ["3.10", "3.14"] steps: - name: Support longpaths @@ -39,10 +39,10 @@ jobs: working-directory: . 
run: git config --system core.longpaths true - - uses: actions/checkout@v6 + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v6 + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: ${{ matrix.python-version }} @@ -86,6 +86,6 @@ jobs: if: failure() && github.event_name == 'schedule' runs-on: ubuntu-slim steps: - - uses: deepset-ai/notify-slack-action@v1 + - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1 with: slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }} diff --git a/integrations/libreoffice/pydoc/config_docusaurus.yml b/integrations/libreoffice/pydoc/config_docusaurus.yml index e9efad1d32..e2948fbbcb 100644 --- a/integrations/libreoffice/pydoc/config_docusaurus.yml +++ b/integrations/libreoffice/pydoc/config_docusaurus.yml @@ -7,7 +7,7 @@ processors: documented_only: true skip_empty_modules: true renderer: - description: Haystack 2.x component to convert files using LibreOffice + description: LibreOffice integration for Haystack id: integrations-libreoffice filename: libreoffice.md - title: LibreOffice File Converter + title: LibreOffice diff --git a/integrations/libreoffice/pyproject.toml b/integrations/libreoffice/pyproject.toml index f979c41c93..7bb96b0b19 100644 --- a/integrations/libreoffice/pyproject.toml +++ b/integrations/libreoffice/pyproject.toml @@ -5,12 +5,12 @@ build-backend = "hatchling.build" [project] name = "libreoffice-haystack" dynamic = ["version"] -description = "Haystack 2.x component to convert files using LibreOffice." 
+description = "LibreOffice integration for Haystack" readme = "README.md" requires-python = ">=3.10" license = "Apache-2.0" keywords = [] -authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }, {name = "Max Swain"}] +authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }, { name = "Max Swain" }] classifiers = [ "License :: OSI Approved :: Apache Software License", "Development Status :: 4 - Beta", @@ -19,6 +19,7 @@ classifiers = [ "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", ] @@ -82,6 +83,13 @@ select = [ "ARG", "B", "C", + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D205", # 1 blank line required between summary line and description + "D209", # Closing triple quotes go to new line + "D213", # summary lines must be positioned on the second physical line of the docstring + "D417", # Missing argument descriptions in the docstring + "D419", # Docstring is empty "DTZ", "E", "EM", @@ -130,7 +138,7 @@ ban-relative-imports = "parents" [tool.ruff.lint.per-file-ignores] # Tests can use magic values, assertions, relative imports, and don't need type annotations -"tests/**/*" = ["PLR2004", "S101", "TID252", "ANN"] +"tests/**/*" = ["D", "PLR2004", "S101", "TID252", "ANN"] [tool.coverage.run] source = ["haystack_integrations"] diff --git a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py index 4106978098..1c5f51138f 100644 --- a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py +++ 
b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py @@ -86,9 +86,38 @@ class LibreOfficeFileConverter: "ppt": frozenset(["pdf", "pptx", "odp", "html", "png", "jpg"]), "odp": frozenset(["pdf", "pptx", "ppt", "html", "png", "jpg"]), } + """A non-exhaustive mapping of supported conversion types by this component. + See https://help.libreoffice.org/latest/en-GB/text/shared/guide/convertfilters.html for more information.""" - def __init__(self) -> None: - """Check whether soffice is installed.""" + def __init__( + self, + output_file_type: Literal[ + "doc", + "docx", + "odt", + "rtf", + "txt", + "html", + "xlsx", + "xls", + "ods", + "csv", + "pptx", + "ppt", + "odp", + "epub", + "png", + "jpg", + ] + | None = None, + ) -> None: + """ + Check whether soffice is installed. + + :param output_file_type: + Target file format to convert to. Must be a valid conversion target for + each source's input type — see :attr:`SUPPORTED_TYPES` for the full mapping. + """ soffice_path = shutil.which("soffice") if soffice_path is None: msg = """LibreOffice (soffice) is required but not installed or not in PATH. @@ -97,6 +126,7 @@ def __init__(self) -> None: raise FileNotFoundError(msg) self.soffice_path = soffice_path + self.output_file_type = output_file_type def to_dict(self) -> dict[str, Any]: """ @@ -127,12 +157,12 @@ def _get_conversion_args( :param source: Source file path. :param output_directory: Output directory to save converted files to. - :param output_file_type: Target file format extension (e.g. ``"pdf"``). - :returns: Tuple of ``(output_path, soffice_args)`` where ``output_path`` is the - expected path of the converted file and ``soffice_args`` is the list of - arguments to pass to ``soffice``. - :raises FileNotFoundError: If ``source`` does not exist. - :raises OSError: If ``output_directory`` does not exist or is not writable. + :param output_file_type: Target file format extension (e.g. `"pdf"`). 
+ :returns: Tuple of `(output_path, soffice_args)` where `output_path` is the + expected path of the converted file and `soffice_args` is the list of + arguments to pass to `soffice`. + :raises FileNotFoundError: If `source` does not exist. + :raises OSError: If `output_directory` does not exist or is not writable. """ source_path = Path(source) output_path = Path(output_directory) @@ -164,12 +194,12 @@ def _validate_args(self, output_file_type: str, input_file_type: str | None = No :param output_file_type: Target file format extension to convert to. :param input_file_type: Source file format extension. If provided, validates that - it is a supported input type and that ``output_file_type`` is a valid conversion + it is a supported input type and that `output_file_type` is a valid conversion target for it. - :raises ValueError: If ``input_file_type`` is not in :attr:`SUPPORTED_TYPES`, or if - ``output_file_type`` is not a valid conversion target for the given ``input_file_type``. + :raises ValueError: If `input_file_type` is not in :attr:`SUPPORTED_TYPES`, or if + `output_file_type` is not a valid conversion target for the given `input_file_type`. """ - # Cannot validate conversion types if input conversions is not known - i.e., source is ``ByteStream`` + # Cannot validate conversion types if input conversions is not known - i.e., source is `ByteStream` if input_file_type is None: return @@ -206,28 +236,36 @@ def run( "epub", "png", "jpg", - ], + ] + | None = None, ) -> LibreOfficeFileConverterOutput: """ Convert office files to the specified output format using LibreOffice. :param sources: - List of sources to convert. Each source can be a file path (``str`` or - ``Path``) or a ``ByteStream``. For ``ByteStream`` sources, the input file - type cannot be inferred from the filename, so only ``output_file_type`` is + List of sources to convert. Each source can be a file path (`str` or + `Path`) or a `ByteStream`. 
For `ByteStream` sources, the input file + type cannot be inferred from the filename, so only `output_file_type` is validated (not the source type). :param output_file_type: Target file format to convert to. Must be a valid conversion target for each source's input type — see :attr:`SUPPORTED_TYPES` for the full mapping. + If set, it will override the `output_file_type` parameter provided during initialization. :returns: A dictionary with the following key: - - ``output``: List of ``ByteStream`` objects containing the converted file - data, in the same order as ``sources``. + - `output`: List of `ByteStream` objects containing the converted file + data, in the same order as `sources`. :raises FileNotFoundError: If a source file path does not exist. :raises OSError: If the internal temporary output directory is not writable. :raises ValueError: If a source's file type is not in :attr:`SUPPORTED_TYPES`, - or if ``output_file_type`` is not a valid conversion target for it. + or if `output_file_type` is not a valid conversion target for it, + or if `output_file_type` has not been provided anywhere. """ + if output_file_type is None and self.output_file_type is None: + msg = "output_file_type must be provided either during initialization or for this method" + raise ValueError(msg) + output_file_type = output_file_type or self.output_file_type + outputs: list[ByteStream] = [] with TemporaryDirectory() as tmpdir: for source in sources: @@ -272,7 +310,8 @@ async def run_async( "epub", "png", "jpg", - ], + ] + | None = None, ) -> LibreOfficeFileConverterOutput: """ Asynchronously convert office files to the specified output format using LibreOffice. @@ -280,22 +319,29 @@ async def run_async( This is the asynchronous version of the `run` method with the same parameters and return values. :param sources: - List of sources to convert. Each source can be a file path (``str`` or - ``Path``) or a ``ByteStream``. 
For ``ByteStream`` sources, the input file - type cannot be inferred from the filename, so only ``output_file_type`` is + List of sources to convert. Each source can be a file path (`str` or + `Path`) or a `ByteStream`. For `ByteStream` sources, the input file + type cannot be inferred from the filename, so only `output_file_type` is validated (not the source type). :param output_file_type: Target file format to convert to. Must be a valid conversion target for each source's input type — see :attr:`SUPPORTED_TYPES` for the full mapping. + If set, it will override the `output_file_type` parameter provided during initialization. :returns: A dictionary with the following key: - - ``output``: List of ``ByteStream`` objects containing the converted file - data, in the same order as ``sources``. + - `output`: List of `ByteStream` objects containing the converted file + data, in the same order as `sources`. :raises FileNotFoundError: If a source file path does not exist. :raises OSError: If the internal temporary output directory is not writable. :raises ValueError: If a source's file type is not in :attr:`SUPPORTED_TYPES`, - or if ``output_file_type`` is not a valid conversion target for it. + or if `output_file_type` is not a valid conversion target for it, + or if `output_file_type` has not been provided anywhere. 
""" + if output_file_type is None and self.output_file_type is None: + msg = "output_file_type must be provided either during initialization or for this method" + raise ValueError(msg) + output_file_type = output_file_type or self.output_file_type + outputs: list[ByteStream] = [] with TemporaryDirectory() as tmpdir: for source in sources: From 75adddd8698001d7b21918d06e6fc0b22e6f0ada Mon Sep 17 00:00:00 2001 From: Max Swain <89113255+maxdswain@users.noreply.github.com> Date: Tue, 24 Mar 2026 10:42:38 +0000 Subject: [PATCH 03/10] fix: Windows libreoffice CI install and mypy errors --- .github/workflows/libreoffice.yml | 2 +- .../components/converters/libreoffice/converter.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/libreoffice.yml b/.github/workflows/libreoffice.yml index 18c114c7f8..983579f02d 100644 --- a/.github/workflows/libreoffice.yml +++ b/.github/workflows/libreoffice.yml @@ -58,7 +58,7 @@ jobs: - name: Install LibreOffice headless (Windows) if: runner.os == 'Windows' - run: choco install libreoffice -y + run: choco install libreoffice-fresh -y - name: Install LibreOffice headless (macOS) if: runner.os == 'macOS' diff --git a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py index 1c5f51138f..5ba2637b81 100644 --- a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py +++ b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py @@ -261,10 +261,10 @@ def run( or if `output_file_type` is not a valid conversion target for it, or if `output_file_type` has not been provided anywhere. 
""" - if output_file_type is None and self.output_file_type is None: + output_file_type = output_file_type or self.output_file_type + if output_file_type is None: msg = "output_file_type must be provided either during initialization or for this method" raise ValueError(msg) - output_file_type = output_file_type or self.output_file_type outputs: list[ByteStream] = [] with TemporaryDirectory() as tmpdir: @@ -337,10 +337,10 @@ async def run_async( or if `output_file_type` is not a valid conversion target for it, or if `output_file_type` has not been provided anywhere. """ - if output_file_type is None and self.output_file_type is None: + output_file_type = output_file_type or self.output_file_type + if output_file_type is None: msg = "output_file_type must be provided either during initialization or for this method" raise ValueError(msg) - output_file_type = output_file_type or self.output_file_type outputs: list[ByteStream] = [] with TemporaryDirectory() as tmpdir: From d0b8f8c5cee9e850bfb3dbeb80e2ef7d51b791e2 Mon Sep 17 00:00:00 2001 From: Max Swain <89113255+maxdswain@users.noreply.github.com> Date: Tue, 24 Mar 2026 10:50:58 +0000 Subject: [PATCH 04/10] fix: Windows libreoffice CI path --- .github/workflows/libreoffice.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/libreoffice.yml b/.github/workflows/libreoffice.yml index 983579f02d..6dce77232b 100644 --- a/.github/workflows/libreoffice.yml +++ b/.github/workflows/libreoffice.yml @@ -58,7 +58,9 @@ jobs: - name: Install LibreOffice headless (Windows) if: runner.os == 'Windows' - run: choco install libreoffice-fresh -y + run: | + choco install libreoffice -y + echo "C:\Program Files\LibreOffice\program" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - name: Install LibreOffice headless (macOS) if: runner.os == 'macOS' From bbe7a459667f191d7cca329fb95f2578624d75ae Mon Sep 17 00:00:00 2001 From: Max Swain <89113255+maxdswain@users.noreply.github.com> Date: Tue, 
24 Mar 2026 11:02:29 +0000 Subject: [PATCH 05/10] fix: Linux libreoffice CI install --- .github/workflows/libreoffice.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/libreoffice.yml b/.github/workflows/libreoffice.yml index 6dce77232b..987628b9a1 100644 --- a/.github/workflows/libreoffice.yml +++ b/.github/workflows/libreoffice.yml @@ -54,7 +54,7 @@ jobs: - name: Install LibreOffice headless (Ubuntu) if: runner.os == 'Linux' - run: sudo apt-get update && sudo apt-get install -y libreoffice-common libreoffice-writer libreoffice-calc + run: sudo apt-get update && sudo apt-get install -y libreoffice-common libreoffice-writer libreoffice-calc libreoffice-impress - name: Install LibreOffice headless (Windows) if: runner.os == 'Windows' From bbbb7efee9849fc980f4db837f23767a320788b3 Mon Sep 17 00:00:00 2001 From: Max Swain <89113255+maxdswain@users.noreply.github.com> Date: Tue, 24 Mar 2026 11:13:20 +0000 Subject: [PATCH 06/10] fix: Windows libreoffice CI --- .github/workflows/libreoffice.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/libreoffice.yml b/.github/workflows/libreoffice.yml index 987628b9a1..a837ed06de 100644 --- a/.github/workflows/libreoffice.yml +++ b/.github/workflows/libreoffice.yml @@ -61,6 +61,8 @@ jobs: run: | choco install libreoffice -y echo "C:\Program Files\LibreOffice\program" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + Start-Process "C:\Program Files\LibreOffice\program\soffice.exe" -ArgumentList "--headless --norestore --nofirststartwizard" -Wait -PassThru + - name: Install LibreOffice headless (macOS) if: runner.os == 'macOS' From 4fc16635d6b5376bfbdad28ffbf00c4ba8a43219 Mon Sep 17 00:00:00 2001 From: Max Swain <89113255+maxdswain@users.noreply.github.com> Date: Tue, 24 Mar 2026 12:11:31 +0000 Subject: [PATCH 07/10] fix: ByteStream on windows --- .github/workflows/libreoffice.yml | 2 - .../converters/libreoffice/converter.py | 61 +++++++++++++------ 2 files 
changed, 43 insertions(+), 20 deletions(-) diff --git a/.github/workflows/libreoffice.yml b/.github/workflows/libreoffice.yml index a837ed06de..987628b9a1 100644 --- a/.github/workflows/libreoffice.yml +++ b/.github/workflows/libreoffice.yml @@ -61,8 +61,6 @@ jobs: run: | choco install libreoffice -y echo "C:\Program Files\LibreOffice\program" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - Start-Process "C:\Program Files\LibreOffice\program\soffice.exe" -ArgumentList "--headless --norestore --nofirststartwizard" -Wait -PassThru - - name: Install LibreOffice headless (macOS) if: runner.os == 'macOS' diff --git a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py index 5ba2637b81..b22c6a029c 100644 --- a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py +++ b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py @@ -5,6 +5,7 @@ import os import shutil import subprocess +import time from asyncio import create_subprocess_exec from collections.abc import Iterable from pathlib import Path @@ -271,14 +272,26 @@ def run( for source in sources: # Handle case where source is a `ByteStream` using tempfile if isinstance(source, ByteStream): - with NamedTemporaryFile(mode="wb") as f: - f.write(source.data) - - self._validate_args(output_file_type) - output_path, args = self._get_conversion_args(f.name, tmpdir, output_file_type) - - subprocess.run(args, check=True) # noqa: S603 - outputs.append(ByteStream(data=output_path.read_bytes())) + # `NamedTemporaryFile` behaves differently on windows and locks the file if `delete=True` + # this workaround uses a try-finally block to make sure the file is deleted while adding a + # retry mechanism as there if often a brief handle after the process finishes + with 
NamedTemporaryFile(mode="wb", delete=False) as f: + try: + f.write(source.data) + + self._validate_args(output_file_type) + tmp_path = f.name + output_path, args = self._get_conversion_args(tmp_path, tmpdir, output_file_type) + + subprocess.run(args, check=True) # noqa: S603 + outputs.append(ByteStream(data=output_path.read_bytes())) + finally: + for _ in range(10): + try: + os.unlink(tmp_path) + break + except PermissionError: + time.sleep(0.1) continue self._validate_args(output_file_type, str(source).split(".")[-1]) @@ -347,16 +360,28 @@ async def run_async( for source in sources: # Handle case where source is a `ByteStream` using tempfile if isinstance(source, ByteStream): - with NamedTemporaryFile(mode="wb") as f: - f.write(source.data) - - self._validate_args(output_file_type) - output_path, args = self._get_conversion_args(f.name, tmpdir, output_file_type) - - process = await create_subprocess_exec(*args) - # Wait for process to complete as only one instance of soffice can occur at once - await process.wait() - outputs.append(ByteStream(data=output_path.read_bytes())) + # `NamedTemporaryFile` behaves differently on windows and locks the file if `delete=True` + # this workaround uses a try-finally block to make sure the file is deleted while adding a + # retry mechanism as there if often a brief handle after the process finishes + with NamedTemporaryFile(mode="wb", delete=False) as f: + try: + f.write(source.data) + + self._validate_args(output_file_type) + tmp_path = f.name + output_path, args = self._get_conversion_args(tmp_path, tmpdir, output_file_type) + + process = await create_subprocess_exec(*args) + # Wait for process to complete as only one instance of soffice can occur at once + await process.wait() + outputs.append(ByteStream(data=output_path.read_bytes())) + finally: + for _ in range(10): + try: + os.unlink(tmp_path) + break + except PermissionError: + time.sleep(0.1) continue self._validate_args(output_file_type, str(source).split(".")[-1]) From 
57a5151327c4b6bc7b699eda407f435a99d98a4c Mon Sep 17 00:00:00 2001 From: Max Swain <89113255+maxdswain@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:19:15 +0000 Subject: [PATCH 08/10] Address comments --- .../converters/libreoffice/converter.py | 178 ++++++------------ 1 file changed, 62 insertions(+), 116 deletions(-) diff --git a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py index b22c6a029c..554d938ad4 100644 --- a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py +++ b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py @@ -5,17 +5,35 @@ import os import shutil import subprocess -import time from asyncio import create_subprocess_exec from collections.abc import Iterable from pathlib import Path -from tempfile import NamedTemporaryFile, TemporaryDirectory -from typing import Any, ClassVar, Literal, TypedDict +from tempfile import TemporaryDirectory +from typing import Any, ClassVar, Literal, TypedDict, get_args from haystack import component, default_from_dict, default_to_dict from haystack.dataclasses import ByteStream from typing_extensions import Self +OUTPUT_FILE_TYPE = Literal[ + "doc", + "docx", + "odt", + "rtf", + "txt", + "html", + "xlsx", + "xls", + "ods", + "csv", + "pptx", + "ppt", + "odp", + "epub", + "png", + "jpg", +] + class LibreOfficeFileConverterOutput(TypedDict): output: list[ByteStream] @@ -92,25 +110,7 @@ class LibreOfficeFileConverter: def __init__( self, - output_file_type: Literal[ - "doc", - "docx", - "odt", - "rtf", - "txt", - "html", - "xlsx", - "xls", - "ods", - "csv", - "pptx", - "ppt", - "odp", - "epub", - "png", - "jpg", - ] - | None = None, + output_file_type: OUTPUT_FILE_TYPE | None = None, ) -> None: """ Check whether soffice is installed. 
@@ -200,7 +200,14 @@ def _validate_args(self, output_file_type: str, input_file_type: str | None = No :raises ValueError: If `input_file_type` is not in :attr:`SUPPORTED_TYPES`, or if `output_file_type` is not a valid conversion target for the given `input_file_type`. """ - # Cannot validate conversion types if input conversions is not known - i.e., source is `ByteStream` + # Validate specified output type is one of allow output file types + supported_output_types = get_args(OUTPUT_FILE_TYPE) + if output_file_type not in supported_output_types: + supported_types = ", ".join(supported_output_types) + msg = f"{output_file_type=} is not supported and must be one of type {supported_types}" + raise ValueError(msg) + + # Cannot further validate conversion types if input conversions is not known - i.e., source is `ByteStream` if input_file_type is None: return @@ -220,25 +227,7 @@ def _validate_args(self, output_file_type: str, input_file_type: str | None = No def run( self, sources: Iterable[str | Path | ByteStream], - output_file_type: Literal[ - "doc", - "docx", - "odt", - "rtf", - "txt", - "html", - "xlsx", - "xls", - "ods", - "csv", - "pptx", - "ppt", - "odp", - "epub", - "png", - "jpg", - ] - | None = None, + output_file_type: OUTPUT_FILE_TYPE | None = None, ) -> LibreOfficeFileConverterOutput: """ Convert office files to the specified output format using LibreOffice. @@ -262,8 +251,8 @@ def run( or if `output_file_type` is not a valid conversion target for it, or if `output_file_type` has not been provided anywhere. 
""" - output_file_type = output_file_type or self.output_file_type - if output_file_type is None: + resolved_output_file_type = output_file_type or self.output_file_type + if resolved_output_file_type is None: msg = "output_file_type must be provided either during initialization or for this method" raise ValueError(msg) @@ -272,30 +261,18 @@ def run( for source in sources: # Handle case where source is a `ByteStream` using tempfile if isinstance(source, ByteStream): - # `NamedTemporaryFile` behaves differently on windows and locks the file if `delete=True` - # this workaround uses a try-finally block to make sure the file is deleted while adding a - # retry mechanism as there if often a brief handle after the process finishes - with NamedTemporaryFile(mode="wb", delete=False) as f: - try: - f.write(source.data) - - self._validate_args(output_file_type) - tmp_path = f.name - output_path, args = self._get_conversion_args(tmp_path, tmpdir, output_file_type) - - subprocess.run(args, check=True) # noqa: S603 - outputs.append(ByteStream(data=output_path.read_bytes())) - finally: - for _ in range(10): - try: - os.unlink(tmp_path) - break - except PermissionError: - time.sleep(0.1) - continue - - self._validate_args(output_file_type, str(source).split(".")[-1]) - output_path, args = self._get_conversion_args(source, tmpdir, output_file_type) + tmp_path = Path(tmpdir) / "input" + tmp_path.write_bytes(source.data) + + self._validate_args(resolved_output_file_type) + output_path, args = self._get_conversion_args(tmp_path, tmpdir, resolved_output_file_type) + + subprocess.run(args, check=True) # noqa: S603 - ruff doesn't know the arguments have been validated + outputs.append(ByteStream(data=output_path.read_bytes())) + continue + + self._validate_args(resolved_output_file_type, str(source).split(".")[-1]) + output_path, args = self._get_conversion_args(source, tmpdir, resolved_output_file_type) subprocess.run(args, check=True) # noqa: S603 
outputs.append(ByteStream(data=output_path.read_bytes())) @@ -306,25 +283,7 @@ def run( async def run_async( self, sources: Iterable[str | Path | ByteStream], - output_file_type: Literal[ - "doc", - "docx", - "odt", - "rtf", - "txt", - "html", - "xlsx", - "xls", - "ods", - "csv", - "pptx", - "ppt", - "odp", - "epub", - "png", - "jpg", - ] - | None = None, + output_file_type: OUTPUT_FILE_TYPE | None = None, ) -> LibreOfficeFileConverterOutput: """ Asynchronously convert office files to the specified output format using LibreOffice. @@ -350,42 +309,29 @@ async def run_async( or if `output_file_type` is not a valid conversion target for it, or if `output_file_type` has not been provided anywhere. """ - output_file_type = output_file_type or self.output_file_type - if output_file_type is None: + resolved_output_file_type = output_file_type or self.output_file_type + if resolved_output_file_type is None: msg = "output_file_type must be provided either during initialization or for this method" raise ValueError(msg) outputs: list[ByteStream] = [] with TemporaryDirectory() as tmpdir: for source in sources: - # Handle case where source is a `ByteStream` using tempfile + # Handle case where source is a `ByteStream` if isinstance(source, ByteStream): - # `NamedTemporaryFile` behaves differently on windows and locks the file if `delete=True` - # this workaround uses a try-finally block to make sure the file is deleted while adding a - # retry mechanism as there if often a brief handle after the process finishes - with NamedTemporaryFile(mode="wb", delete=False) as f: - try: - f.write(source.data) - - self._validate_args(output_file_type) - tmp_path = f.name - output_path, args = self._get_conversion_args(tmp_path, tmpdir, output_file_type) - - process = await create_subprocess_exec(*args) - # Wait for process to complete as only one instance of soffice can occur at once - await process.wait() - outputs.append(ByteStream(data=output_path.read_bytes())) - finally: - for _ in 
range(10): - try: - os.unlink(tmp_path) - break - except PermissionError: - time.sleep(0.1) - continue - - self._validate_args(output_file_type, str(source).split(".")[-1]) - output_path, args = self._get_conversion_args(source, tmpdir, output_file_type) + tmp_path = Path(tmpdir) / "input" + tmp_path.write_bytes(source.data) + + self._validate_args(resolved_output_file_type) + output_path, args = self._get_conversion_args(tmp_path, tmpdir, resolved_output_file_type) + + process = await create_subprocess_exec(*args) + # Wait for process to complete as only one instance of soffice can occur at once + await process.wait() + outputs.append(ByteStream(data=output_path.read_bytes())) + + self._validate_args(resolved_output_file_type, str(source).split(".")[-1]) + output_path, args = self._get_conversion_args(source, tmpdir, resolved_output_file_type) process = await create_subprocess_exec(*args) # Wait for process to complete as only one instance of soffice can occur at once From f0ed1e041b26809cef88902bd2578d66552e4f79 Mon Sep 17 00:00:00 2001 From: Max Swain <89113255+maxdswain@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:22:06 +0000 Subject: [PATCH 09/10] Add accidentally removed continue --- .../components/converters/libreoffice/converter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py index 554d938ad4..9f57fac60d 100644 --- a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py +++ b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py @@ -329,6 +329,7 @@ async def run_async( # Wait for process to complete as only one instance of soffice can occur at once await process.wait() outputs.append(ByteStream(data=output_path.read_bytes())) + continue 
self._validate_args(resolved_output_file_type, str(source).split(".")[-1]) output_path, args = self._get_conversion_args(source, tmpdir, resolved_output_file_type) From 469132b2ff7af8f9e4f9fe2929460bc4c8b0da0c Mon Sep 17 00:00:00 2001 From: Max Swain <89113255+maxdswain@users.noreply.github.com> Date: Tue, 24 Mar 2026 14:23:24 +0000 Subject: [PATCH 10/10] Update supported output file types --- .../components/converters/libreoffice/converter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py index 9f57fac60d..b828aed7b0 100644 --- a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py +++ b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py @@ -16,6 +16,7 @@ from typing_extensions import Self OUTPUT_FILE_TYPE = Literal[ + "pdf", "doc", "docx", "odt",