diff --git a/.github/labeler.yml b/.github/labeler.yml index 0929acebe7..59a97a2840 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -118,6 +118,11 @@ integration:lara: - any-glob-to-any-file: "integrations/lara/**/*" - any-glob-to-any-file: ".github/workflows/lara.yml" +integration:libreoffice: + - changed-files: + - any-glob-to-any-file: "integrations/libreoffice/**/*" + - any-glob-to-any-file: ".github/workflows/libreoffice.yml" + integration:llama_cpp: - changed-files: - any-glob-to-any-file: "integrations/llama_cpp/**/*" diff --git a/.github/workflows/libreoffice.yml b/.github/workflows/libreoffice.yml new file mode 100644 index 0000000000..987628b9a1 --- /dev/null +++ b/.github/workflows/libreoffice.yml @@ -0,0 +1,93 @@ +# This workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / libreoffice + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/libreoffice/**" + - "!integrations/libreoffice/*.md" + - ".github/workflows/libreoffice.yml" + +defaults: + run: + working-directory: integrations/libreoffice + +concurrency: + group: libreoffice-${{ github.head_ref }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + +jobs: + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ["3.10", "3.14"] + + steps: + - name: Support longpaths + if: matrix.os == 'windows-latest' + working-directory: . 
+ run: git config --system core.longpaths true + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install --upgrade hatch + - name: Lint + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: hatch run fmt-check && hatch run test:types + + - name: Install LibreOffice headless (Ubuntu) + if: runner.os == 'Linux' + run: sudo apt-get update && sudo apt-get install -y libreoffice-common libreoffice-writer libreoffice-calc libreoffice-impress + + - name: Install LibreOffice headless (Windows) + if: runner.os == 'Windows' + run: | + choco install libreoffice -y + echo "C:\Program Files\LibreOffice\program" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + + - name: Install LibreOffice headless (macOS) + if: runner.os == 'macOS' + run: brew install --cask libreoffice + + - name: Run tests + run: hatch run test:cov-retry + + - name: Run unit tests with lowest direct dependencies + run: | + hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt + hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt + hatch run test:unit + + - name: Nightly - run tests with Haystack main branch + if: github.event_name == 'schedule' + run: | + hatch env prune + hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main + hatch run test:cov-retry + + + notify-slack-on-failure: + needs: run + if: failure() && github.event_name == 'schedule' + runs-on: ubuntu-slim + steps: + - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1 + with: + slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }} diff --git a/README.md b/README.md index 7319a7bde8..eb91034b44 100644 --- 
a/README.md +++ b/README.md @@ -49,6 +49,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [jina-haystack](integrations/jina/) | Connector, Embedder, Ranker | [![PyPI - Version](https://img.shields.io/pypi/v/jina-haystack.svg)](https://pypi.org/project/jina-haystack) | [![Test / jina](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/jina.yml) | | [langfuse-haystack](integrations/langfuse/) | Tracer | [![PyPI - Version](https://img.shields.io/pypi/v/langfuse-haystack.svg?color=orange)](https://pypi.org/project/langfuse-haystack) | [![Test / langfuse](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/langfuse.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/langfuse.yml) | | [lara-haystack](integrations/lara/) | Translator | [![PyPI - Version](https://img.shields.io/pypi/v/lara-haystack.svg)](https://pypi.org/project/lara-haystack) | [![Test / lara](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/lara.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/lara.yml) | +| [libreoffice-haystack](integrations/libreoffice/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/libreoffice-haystack.svg)](https://pypi.org/project/libreoffice-haystack) | [![Test / libreoffice](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/libreoffice.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/libreoffice.yml) | | [llama-cpp-haystack](integrations/llama_cpp/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/llama-cpp-haystack.svg?color=orange)](https://pypi.org/project/llama-cpp-haystack) | [![Test / 
llama-cpp](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_cpp.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_cpp.yml) | | [llama-stack-haystack](integrations/llama_stack/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/llama-stack-haystack.svg?color=orange)](https://pypi.org/project/llama-stack-haystack) | [![Test / llama-stack](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_stack.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/llama_stack.yml) | | [mcp-haystack](integrations/mcp/) | Tool | [![PyPI - Version](https://img.shields.io/pypi/v/mcp-haystack.svg?color=orange)](https://pypi.org/project/mcp-haystack) | [![Test / mcp](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mcp.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/mcp.yml) | diff --git a/integrations/libreoffice/LICENSE.txt b/integrations/libreoffice/LICENSE.txt new file mode 100644 index 0000000000..6134ab324f --- /dev/null +++ b/integrations/libreoffice/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2023-present deepset GmbH + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/integrations/libreoffice/README.md b/integrations/libreoffice/README.md new file mode 100644 index 0000000000..92bea2c831 --- /dev/null +++ b/integrations/libreoffice/README.md @@ -0,0 +1,12 @@ +# libreoffice-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/libreoffice-haystack.svg)](https://pypi.org/project/libreoffice-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/libreoffice-haystack.svg)](https://pypi.org/project/libreoffice-haystack) + +- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/libreoffice/CHANGELOG.md) + +--- + +## Contributing + +Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). 
diff --git a/integrations/libreoffice/pydoc/config_docusaurus.yml b/integrations/libreoffice/pydoc/config_docusaurus.yml new file mode 100644 index 0000000000..e2948fbbcb --- /dev/null +++ b/integrations/libreoffice/pydoc/config_docusaurus.yml @@ -0,0 +1,13 @@ +loaders: + - modules: + - haystack_integrations.components.converters.libreoffice.converter + search_path: [../src] +processors: + - type: filter + documented_only: true + skip_empty_modules: true +renderer: + description: LibreOffice integration for Haystack + id: integrations-libreoffice + filename: libreoffice.md + title: LibreOffice diff --git a/integrations/libreoffice/pyproject.toml b/integrations/libreoffice/pyproject.toml new file mode 100644 index 0000000000..7bb96b0b19 --- /dev/null +++ b/integrations/libreoffice/pyproject.toml @@ -0,0 +1,159 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "libreoffice-haystack" +dynamic = ["version"] +description = "LibreOffice integration for Haystack" +readme = "README.md" +requires-python = ">=3.10" +license = "Apache-2.0" +keywords = [] +authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }, { name = "Max Swain" }] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = ["haystack-ai"] + +[project.urls] +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/libreoffice#readme" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" +Source = 
"https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/libreoffice" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/libreoffice-v(?P<version>.*)' + +[tool.hatch.version.raw-options] +root = "../.." +git_describe_command = 'git describe --tags --match="integrations/libreoffice-v[0-9]*"' + +[tool.hatch.envs.default] +installer = "uv" +dependencies = ["haystack-pydoc-tools", "ruff"] + +[tool.hatch.envs.default.scripts] +docs = ["haystack-pydoc pydoc/config_docusaurus.yml"] +fmt = "ruff check --fix {args}; ruff format {args}" +fmt-check = "ruff check {args} && ruff format --check {args}" + +[tool.hatch.envs.test] +dependencies = [ + "pytest", + "pytest-asyncio", + "pytest-cov", + "pytest-rerunfailures", + "mypy", + "pip", +] + +[tool.hatch.envs.test.scripts] +unit = 'pytest -m "not integration" {args:tests}' +integration = 'pytest -m "integration" {args:tests}' +all = 'pytest {args:tests}' +cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x {args:tests}' +types = "mypy -p haystack_integrations.components.converters.libreoffice {args}" + +[tool.mypy] +install_types = true +non_interactive = true +check_untyped_defs = true +disallow_incomplete_defs = true + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = [ + "A", + "ANN", + "ARG", + "B", + "C", + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D205", # 1 blank line required between summary line and description + "D209", # Closing triple quotes go to new line + "D213", # summary lines must be positioned on the second physical line of the docstring + "D417", # Missing argument descriptions in the docstring + "D419", # Docstring is empty + "DTZ", + "E", + "EM", + "F", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # 
Allow non-abstract empty methods in abstract base classes + "B027", + # Allow function calls in argument defaults (common Haystack pattern for Secret.from_env_var) + "B008", + # Ignore checks for possible passwords + "S105", + "S106", + "S107", + # Ignore complexity + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", + # Allow `Any` type - used legitimately for dynamic types and SDK boundaries + "ANN401", +] + +[tool.ruff.lint.isort] +known-first-party = ["haystack_integrations"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.lint.per-file-ignores] +# Tests can use magic values, assertions, relative imports, and don't need type annotations +"tests/**/*" = ["D", "PLR2004", "S101", "TID252", "ANN"] + +[tool.coverage.run] +source = ["haystack_integrations"] +branch = true +parallel = false + +[tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing = true +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + +[tool.pytest.ini_options] +addopts = "--strict-markers" +markers = [ + "integration: integration tests", +] +log_cli = true +asyncio_default_fixture_loop_scope = "function" diff --git a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/__init__.py b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/__init__.py new file mode 100644 index 0000000000..2d46328fd8 --- /dev/null +++ b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from .converter import LibreOfficeFileConverter + +__all__ = ["LibreOfficeFileConverter"] diff --git a/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py new file mode 
100644 index 0000000000..b828aed7b0 --- /dev/null +++ b/integrations/libreoffice/src/haystack_integrations/components/converters/libreoffice/converter.py @@ -0,0 +1,344 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import os +import shutil +import subprocess +from asyncio import create_subprocess_exec +from collections.abc import Iterable +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Any, ClassVar, Literal, TypedDict, get_args + +from haystack import component, default_from_dict, default_to_dict +from haystack.dataclasses import ByteStream +from typing_extensions import Self + +OUTPUT_FILE_TYPE = Literal[ + "pdf", + "doc", + "docx", + "odt", + "rtf", + "txt", + "html", + "xlsx", + "xls", + "ods", + "csv", + "pptx", + "ppt", + "odp", + "epub", + "png", + "jpg", +] + + +class LibreOfficeFileConverterOutput(TypedDict): + output: list[ByteStream] + + +@component +class LibreOfficeFileConverter: + """ + Component that uses libreoffice's command line utility (soffice) to convert files into various formats. 
+ + ### Usage examples + + **Simple conversion:** + ```python + from pathlib import Path + + from haystack_integrations.components.converters.libreoffice import LibreOfficeFileConverter + + # Convert documents + converter = LibreOfficeFileConverter() + results = converter.run(sources=[Path("sample.doc")], output_file_type="docx") + print(results["output"]) # [ByteStream(data=b'...', meta={}, mime_type=None)] + ``` + + **Conversion pipeline:** + ```python + from pathlib import Path + + from haystack import Pipeline + from haystack.components.converters import DOCXToDocument + + from haystack_integrations.components.converters.libreoffice import LibreOfficeFileConverter + + # Create pipeline with components + pipeline = Pipeline() + pipeline.add_component("libreoffice_converter", LibreOfficeFileConverter()) + pipeline.add_component("docx_converter", DOCXToDocument()) + + pipeline.connect("libreoffice_converter.output", "docx_converter.sources") + + # Run pipeline and convert legacy documents into Haystack documents + results = pipeline.run( + { + "libreoffice_converter": { + "sources": [Path("sample_doc.doc")], + "output_file_type": "docx", + } + } + ) + print(results["docx_converter"]["documents"]) + ``` + """ + + SUPPORTED_TYPES: ClassVar[dict[str, frozenset[str]]] = { + # Documents + "doc": frozenset(["pdf", "docx", "odt", "rtf", "txt", "html", "epub"]), + "docx": frozenset(["pdf", "doc", "odt", "rtf", "txt", "html", "epub"]), + "odt": frozenset(["pdf", "docx", "doc", "rtf", "txt", "html", "epub"]), + "rtf": frozenset(["pdf", "docx", "doc", "odt", "txt", "html"]), + "txt": frozenset(["pdf", "docx", "doc", "odt", "rtf", "html"]), + "html": frozenset(["pdf", "docx", "doc", "odt", "rtf", "txt"]), + # Spreadsheets + "xlsx": frozenset(["pdf", "xls", "ods", "csv", "html"]), + "xls": frozenset(["pdf", "xlsx", "ods", "csv", "html"]), + "ods": frozenset(["pdf", "xlsx", "xls", "csv", "html"]), + "csv": frozenset(["pdf", "xlsx", "xls", "ods"]), + # Presentations + "pptx": 
frozenset(["pdf", "ppt", "odp", "html", "png", "jpg"]), + "ppt": frozenset(["pdf", "pptx", "odp", "html", "png", "jpg"]), + "odp": frozenset(["pdf", "pptx", "ppt", "html", "png", "jpg"]), + } + """A non-exhaustive mapping of supported conversion types by this component. + See https://help.libreoffice.org/latest/en-GB/text/shared/guide/convertfilters.html for more information.""" + + def __init__( + self, + output_file_type: OUTPUT_FILE_TYPE | None = None, + ) -> None: + """ + Check whether soffice is installed. + + :param output_file_type: + Target file format to convert to. Must be a valid conversion target for + each source's input type — see :attr:`SUPPORTED_TYPES` for the full mapping. + """ + soffice_path = shutil.which("soffice") + if soffice_path is None: + msg = """LibreOffice (soffice) is required but not installed or not in PATH. + +- Install instructions: https://www.libreoffice.org/get-help/install-howto/""" + raise FileNotFoundError(msg) + + self.soffice_path = soffice_path + self.output_file_type = output_file_type + + def to_dict(self) -> dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return default_to_dict(self) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> Self: + """ + Deserializes the component from a dictionary. + + :param data: + The dictionary to deserialize from. + :returns: + The deserialized component. + """ + return default_from_dict(cls, data) + + def _get_conversion_args( + self, source: str | Path, output_directory: str | Path, output_file_type: str + ) -> tuple[Path, list[str]]: + """ + Validate source file and return the soffice arguments for conversion. + + :param source: Source file path. + :param output_directory: Output directory to save converted files to. + :param output_file_type: Target file format extension (e.g. `"pdf"`). 
+        :returns: Tuple of `(output_path, soffice_args)` where `output_path` is the
+            expected path of the converted file and `soffice_args` is the list of
+            arguments to pass to `soffice`.
+        :raises FileNotFoundError: If `source` does not exist.
+        :raises OSError: If `output_directory` does not exist or is not writable.
+        """
+        source_path = Path(source)
+        output_path = Path(output_directory)
+
+        # Source file must exist
+        if not source_path.is_file():
+            msg = f"{source=} does not exist"
+            raise FileNotFoundError(msg)
+
+        # Output directory must exist and be writable
+        if not output_path.is_dir() or not os.access(output_path, os.W_OK):
+            msg = f"{output_directory=} must exist and be writable"
+            raise OSError(msg)
+
+        args = [
+            self.soffice_path,
+            "--headless",
+            "--convert-to",
+            output_file_type,
+            "--outdir",
+            str(output_directory),
+            str(source),
+        ]
+        return (output_path / source_path.name).with_suffix(f".{output_file_type}"), args
+
+    def _validate_args(self, output_file_type: str, input_file_type: str | None = None) -> None:
+        """
+        Validate that the input and output file types are supported.
+
+        :param output_file_type: Target file format extension to convert to.
+        :param input_file_type: Source file format extension. If provided, validates that
+            it is a supported input type and that `output_file_type` is a valid conversion
+            target for it.
+        :raises ValueError: If `output_file_type` is not one of the `OUTPUT_FILE_TYPE` values,
+            if `input_file_type` is not in :attr:`SUPPORTED_TYPES`, or if `output_file_type`
+            is not a valid conversion target for the given `input_file_type`.
+        """
+        # Validate that the requested output type is one of the allowed output file types
+        supported_output_types = get_args(OUTPUT_FILE_TYPE)
+        if output_file_type not in supported_output_types:
+            supported_types = ", ".join(supported_output_types)
+            msg = f"{output_file_type=} is not supported and must be one of type {supported_types}"
+            raise ValueError(msg)
+
+        # The conversion pair cannot be validated when the input type is unknown - i.e., source is `ByteStream`
+        if input_file_type is None:
+            return
+
+        if input_file_type not in self.SUPPORTED_TYPES:
+            supported_types = ", ".join(self.SUPPORTED_TYPES)
+            msg = f"{input_file_type=} is not supported and must be one of type {supported_types}"
+            raise ValueError(msg)
+
+        if output_file_type not in (output_types := self.SUPPORTED_TYPES[input_file_type]):
+            supported_types = ", ".join(output_types)
+            msg = (
+                f"{output_file_type=} is not supported for {input_file_type=} and must be one of type {supported_types}"
+            )
+            raise ValueError(msg)
+
+    def _prepare_conversion(
+        self, source: str | Path | ByteStream, tmpdir: str, output_file_type: OUTPUT_FILE_TYPE
+    ) -> tuple[Path, list[str]]:
+        """
+        Validate a single source and build the `soffice` invocation that converts it.
+
+        :param source: File path or `ByteStream` to convert.
+        :param tmpdir: Existing temporary directory that receives the converted file and,
+            for `ByteStream` sources, the staged input file.
+        :param output_file_type: Target file format extension to convert to.
+        :returns: Tuple of `(output_path, soffice_args)` as returned by `_get_conversion_args`.
+        :raises FileNotFoundError: If a source file path does not exist.
+        :raises OSError: If `tmpdir` does not exist or is not writable.
+        :raises ValueError: If the requested conversion is not supported.
+        """
+        if isinstance(source, ByteStream):
+            # The input type of a `ByteStream` is not inferred, so only the output type is checked.
+            # Validate before staging the bytes on disk so an invalid request writes nothing.
+            self._validate_args(output_file_type)
+            staged_input = Path(tmpdir) / "input"
+            staged_input.write_bytes(source.data)
+            return self._get_conversion_args(staged_input, tmpdir, output_file_type)
+
+        # `Path.suffix` only inspects the final path component, whereas splitting the whole string
+        # on "." picks up dots in parent directories when the file itself has no extension.
+        self._validate_args(output_file_type, Path(source).suffix.lstrip("."))
+        return self._get_conversion_args(source, tmpdir, output_file_type)
+
+    @staticmethod
+    def _read_converted_file(output_path: Path) -> ByteStream:
+        """
+        Load a converted file into a `ByteStream` and remove it from the working directory.
+
+        :param output_path: Path where `soffice` was expected to write the converted file.
+        :returns: `ByteStream` holding the converted file's bytes.
+        :raises FileNotFoundError: If the conversion did not produce `output_path`.
+        """
+        data = output_path.read_bytes()
+        # Output names are derived from the source's file name, so sources sharing a name (every
+        # `ByteStream` is staged as "input") map to the same path. Deleting the file once it has
+        # been read guarantees that a later conversion which writes nothing fails here instead of
+        # returning this source's bytes.
+        output_path.unlink()
+        return ByteStream(data=data)
+
+    @component.output_types(output=list[ByteStream])
+    def run(
+        self,
+        sources: Iterable[str | Path | ByteStream],
+        output_file_type: OUTPUT_FILE_TYPE | None = None,
+    ) -> LibreOfficeFileConverterOutput:
+        """
+        Convert office files to the specified output format using LibreOffice.
+
+        :param sources:
+            List of sources to convert. Each source can be a file path (`str` or
+            `Path`) or a `ByteStream`. For `ByteStream` sources, the input file
+            type cannot be inferred from the filename, so only `output_file_type` is
+            validated (not the source type).
+        :param output_file_type:
+            Target file format to convert to. Must be a valid conversion target for
+            each source's input type — see :attr:`SUPPORTED_TYPES` for the full mapping.
+            If set, it will override the `output_file_type` parameter provided during initialization.
+        :returns:
+            A dictionary with the following key:
+            - `output`: List of `ByteStream` objects containing the converted file
+              data, in the same order as `sources`.
+        :raises FileNotFoundError: If a source file path does not exist, or if a conversion
+            did not produce the expected output file.
+        :raises OSError: If the internal temporary output directory is not writable.
+        :raises ValueError: If a source's file type is not in :attr:`SUPPORTED_TYPES`,
+            or if `output_file_type` is not a valid conversion target for it,
+            or if `output_file_type` has not been provided anywhere.
+        :raises subprocess.CalledProcessError: If `soffice` exits with a non-zero status.
+        """
+        resolved_output_file_type = output_file_type or self.output_file_type
+        if resolved_output_file_type is None:
+            msg = "output_file_type must be provided either during initialization or for this method"
+            raise ValueError(msg)
+
+        outputs: list[ByteStream] = []
+        with TemporaryDirectory() as tmpdir:
+            for source in sources:
+                output_path, args = self._prepare_conversion(source, tmpdir, resolved_output_file_type)
+
+                subprocess.run(args, check=True)  # noqa: S603 - ruff doesn't know the arguments have been validated
+                outputs.append(self._read_converted_file(output_path))
+
+        return {"output": outputs}
+
+    @component.output_types(output=list[ByteStream])
+    async def run_async(
+        self,
+        sources: Iterable[str | Path | ByteStream],
+        output_file_type: OUTPUT_FILE_TYPE | None = None,
+    ) -> LibreOfficeFileConverterOutput:
+        """
+        Asynchronously convert office files to the specified output format using LibreOffice.
+
+        This is the asynchronous version of the `run` method with the same parameters and return values.
+
+        :param sources:
+            List of sources to convert. Each source can be a file path (`str` or
+            `Path`) or a `ByteStream`. For `ByteStream` sources, the input file
+            type cannot be inferred from the filename, so only `output_file_type` is
+            validated (not the source type).
+        :param output_file_type:
+            Target file format to convert to. Must be a valid conversion target for
+            each source's input type — see :attr:`SUPPORTED_TYPES` for the full mapping.
+            If set, it will override the `output_file_type` parameter provided during initialization.
+        :returns:
+            A dictionary with the following key:
+            - `output`: List of `ByteStream` objects containing the converted file
+              data, in the same order as `sources`.
+        :raises FileNotFoundError: If a source file path does not exist, or if a conversion
+            did not produce the expected output file.
+        :raises OSError: If the internal temporary output directory is not writable.
+        :raises ValueError: If a source's file type is not in :attr:`SUPPORTED_TYPES`,
+            or if `output_file_type` is not a valid conversion target for it,
+            or if `output_file_type` has not been provided anywhere.
+        :raises subprocess.CalledProcessError: If `soffice` exits with a non-zero status.
+        """
+        resolved_output_file_type = output_file_type or self.output_file_type
+        if resolved_output_file_type is None:
+            msg = "output_file_type must be provided either during initialization or for this method"
+            raise ValueError(msg)
+
+        outputs: list[ByteStream] = []
+        with TemporaryDirectory() as tmpdir:
+            for source in sources:
+                output_path, args = self._prepare_conversion(source, tmpdir, resolved_output_file_type)
+
+                process = await create_subprocess_exec(*args)
+                # Wait for each conversion before starting the next, as only one instance of soffice
+                # can run at once, and report a failed exit the same way `run` does with `check=True`.
+                return_code = await process.wait()
+                if return_code != 0:
+                    raise subprocess.CalledProcessError(return_code, args)
+
+                outputs.append(self._read_converted_file(output_path))
+
+        return {"output": outputs}
diff --git a/integrations/libreoffice/src/haystack_integrations/components/converters/py.typed b/integrations/libreoffice/src/haystack_integrations/components/converters/py.typed
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/integrations/libreoffice/tests/__init__.py b/integrations/libreoffice/tests/__init__.py
new file mode 100644
index 0000000000..c1764a6e03
--- /dev/null
+++ b/integrations/libreoffice/tests/__init__.py
@@ -0,0 +1,3 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/integrations/libreoffice/tests/test_converter.py b/integrations/libreoffice/tests/test_converter.py
new file mode 100644
index 0000000000..2111170067
---
/dev/null
+++ b/integrations/libreoffice/tests/test_converter.py
@@ -0,0 +1,132 @@
+# SPDX-FileCopyrightText: 2022-present deepset GmbH
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from collections.abc import Generator
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+from haystack.dataclasses import ByteStream
+
+from haystack_integrations.components.converters.libreoffice import LibreOfficeFileConverter
+
+
+@pytest.fixture
+def converter() -> LibreOfficeFileConverter:
+    """Unpatched converter for the integration tests, which run a real conversion."""
+    return LibreOfficeFileConverter()
+
+
+@pytest.fixture
+def mock_converter() -> Generator[LibreOfficeFileConverter, None]:
+    """Converter created while `shutil.which` is patched to return a fake `soffice` path."""
+    with patch("shutil.which", return_value="/usr/bin/soffice"):
+        yield LibreOfficeFileConverter()
+
+
+@pytest.fixture
+def test_files_path() -> Path:
+    """Directory with the sample documents, resolved from this module rather than the working directory."""
+    return Path(__file__).parent / "test_files"
+
+
+class TestLibreOfficeFileConverter:
+    def test_init(self, mock_converter: LibreOfficeFileConverter) -> None:
+        assert isinstance(mock_converter, LibreOfficeFileConverter)
+        assert isinstance(mock_converter.soffice_path, str)
+
+    def test_init_raises_when_soffice_not_found(self) -> None:
+        with patch("shutil.which", return_value=None):
+            with pytest.raises(FileNotFoundError, match="LibreOffice"):
+                LibreOfficeFileConverter()
+
+    def test_to_dict(self, mock_converter: LibreOfficeFileConverter) -> None:
+        data = mock_converter.to_dict()
+        assert data == {
+            "type": "haystack_integrations.components.converters.libreoffice.converter.LibreOfficeFileConverter",
+            "init_parameters": {},
+        }
+
+    def test_from_dict(self) -> None:
+        data = {
+            "type": "haystack_integrations.components.converters.libreoffice.converter.LibreOfficeFileConverter",
+            "init_parameters": {},
+        }
+        with patch("shutil.which", return_value="/usr/bin/soffice"):
+            converter = LibreOfficeFileConverter.from_dict(data)
+        assert isinstance(converter.soffice_path, str)
+
+    def test_run_unsupported_input_type(self, mock_converter: LibreOfficeFileConverter) -> None:
+        # .pdf is not a supported input type in SUPPORTED_TYPES
+        with pytest.raises(ValueError):
+            mock_converter.run(["test_file.pdf"], output_file_type="docx")
+
+    def test_run_unsupported_output_type(self, mock_converter: LibreOfficeFileConverter) -> None:
+        # .doc -> .png is not a valid conversion
+        with pytest.raises(ValueError):
+            mock_converter.run(["test_file.doc"], output_file_type="png")
+
+    def test_run_no_file(self, mock_converter: LibreOfficeFileConverter) -> None:
+        with pytest.raises(FileNotFoundError):
+            mock_converter.run(["nonexistent_file.doc"], output_file_type="docx")
+
+    @pytest.mark.integration
+    def test_run(self, converter: LibreOfficeFileConverter, test_files_path: Path) -> None:
+        paths = [
+            test_files_path / "doc" / "sample_doc.doc",
+            test_files_path / "ppt" / "sample_ppt.ppt",
+            test_files_path / "xls" / "basic_tables_two_sheets.xls",
+        ]
+
+        results = converter.run(paths, output_file_type="pdf")
+
+        output = results["output"]
+        assert len(output) == 3
+        for stream in output:
+            assert isinstance(stream, ByteStream)
+            assert len(stream.data) > 0
+
+    @pytest.mark.integration
+    def test_run_bytestream_source(self, converter: LibreOfficeFileConverter, test_files_path: Path) -> None:
+        source_path = test_files_path / "doc" / "sample_doc.doc"
+        bytestream = ByteStream(data=source_path.read_bytes())
+
+        results = converter.run([bytestream], output_file_type="pdf")
+
+        output = results["output"]
+        assert len(output) == 1
+        assert isinstance(output[0], ByteStream)
+        assert len(output[0].data) > 0
+
+    @pytest.mark.asyncio
+    @pytest.mark.integration
+    async def test_run_async(self, converter: LibreOfficeFileConverter, test_files_path: Path) -> None:
+        paths = [
+            test_files_path / "doc" / "sample_doc.doc",
+            test_files_path / "ppt" / "sample_ppt.ppt",
+            test_files_path / "xls" / "basic_tables_two_sheets.xls",
+        ]
+
+        results = await converter.run_async(paths, output_file_type="pdf")
+
+        output = results["output"]
+        assert len(output) == 3
+        for stream in output:
+            assert isinstance(stream, ByteStream)
+            assert len(stream.data) > 0
+
+    @pytest.mark.asyncio
+    @pytest.mark.integration
+    async def test_run_async_bytestream_source(
+        self, converter: LibreOfficeFileConverter, test_files_path: Path
+    ) -> None:
+        source_path = test_files_path / "doc" / "sample_doc.doc"
+        bytestream = ByteStream(data=source_path.read_bytes())
+
+        results = await converter.run_async([bytestream], output_file_type="pdf")
+
+        output = results["output"]
+        assert len(output) == 1
+        assert isinstance(output[0], ByteStream)
+        assert len(output[0].data) > 0
diff --git a/integrations/libreoffice/tests/test_files/doc/sample_doc.doc b/integrations/libreoffice/tests/test_files/doc/sample_doc.doc
new file mode 100644
index 0000000000..70b72d50d5
Binary files /dev/null and b/integrations/libreoffice/tests/test_files/doc/sample_doc.doc differ
diff --git a/integrations/libreoffice/tests/test_files/ppt/sample_ppt.ppt b/integrations/libreoffice/tests/test_files/ppt/sample_ppt.ppt
new file mode 100644
index 0000000000..b2a9c61935
Binary files /dev/null and b/integrations/libreoffice/tests/test_files/ppt/sample_ppt.ppt differ
diff --git a/integrations/libreoffice/tests/test_files/xls/basic_tables_two_sheets.xls b/integrations/libreoffice/tests/test_files/xls/basic_tables_two_sheets.xls
new file mode 100644
index 0000000000..9d6336968f
Binary files /dev/null and b/integrations/libreoffice/tests/test_files/xls/basic_tables_two_sheets.xls differ