From 4690039665a0ff6f85422519efc520f82e055dc3 Mon Sep 17 00:00:00 2001 From: Zafer Kiziltoprak Date: Mon, 13 Apr 2026 14:22:56 +0200 Subject: [PATCH 1/6] feat: add Amazon Textract integration (#2391) Add AmazonTextractConverter component that extracts text from images and single-page PDFs using the AWS Textract synchronous API. Supports both DetectDocumentText (plain OCR) and AnalyzeDocument (tables, forms, signatures, layout) as well as natural-language queries. Includes CI workflow, unit/integration tests, pydoc config, and repo-level wiring (labeler, coverage comment, README). --- .github/labeler.yml | 5 + .github/workflows/CI_coverage_comment.yml | 1 + .github/workflows/amazon_textract.yml | 139 +++++ README.md | 1 + integrations/amazon_textract/CHANGELOG.md | 3 + integrations/amazon_textract/LICENSE.txt | 201 +++++++ integrations/amazon_textract/README.md | 134 +++++ .../pydoc/config_docusaurus.yml | 13 + integrations/amazon_textract/pyproject.toml | 178 ++++++ .../converters/amazon_textract/__init__.py | 7 + .../converters/amazon_textract/converter.py | 273 ++++++++++ .../converters/amazon_textract/errors.py | 11 + .../components/converters/py.typed | 0 .../amazon_textract/tests/__init__.py | 3 + .../tests/test_amazon_textract_converter.py | 510 ++++++++++++++++++ ...t_amazon_textract_converter_integration.py | 89 +++ .../tests/test_files/broken_image.png | Bin 0 -> 20 bytes .../tests/test_files/sample_text.png | Bin 0 -> 23511 bytes 18 files changed, 1568 insertions(+) create mode 100644 .github/workflows/amazon_textract.yml create mode 100644 integrations/amazon_textract/CHANGELOG.md create mode 100644 integrations/amazon_textract/LICENSE.txt create mode 100644 integrations/amazon_textract/README.md create mode 100644 integrations/amazon_textract/pydoc/config_docusaurus.yml create mode 100644 integrations/amazon_textract/pyproject.toml create mode 100644 integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/__init__.py 
create mode 100644 integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/converter.py create mode 100644 integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/errors.py create mode 100644 integrations/amazon_textract/src/haystack_integrations/components/converters/py.typed create mode 100644 integrations/amazon_textract/tests/__init__.py create mode 100644 integrations/amazon_textract/tests/test_amazon_textract_converter.py create mode 100644 integrations/amazon_textract/tests/test_amazon_textract_converter_integration.py create mode 100644 integrations/amazon_textract/tests/test_files/broken_image.png create mode 100644 integrations/amazon_textract/tests/test_files/sample_text.png diff --git a/.github/labeler.yml b/.github/labeler.yml index 349edf670c..4b2a08c6ac 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -14,6 +14,11 @@ integration:amazon-sagemaker: - any-glob-to-any-file: "integrations/amazon_sagemaker/**/*" - any-glob-to-any-file: ".github/workflows/amazon_sagemaker.yml" +integration:amazon-textract: + - changed-files: + - any-glob-to-any-file: "integrations/amazon_textract/**/*" + - any-glob-to-any-file: ".github/workflows/amazon_textract.yml" + integration:anthropic: - changed-files: - any-glob-to-any-file: "integrations/anthropic/**/*" diff --git a/.github/workflows/CI_coverage_comment.yml b/.github/workflows/CI_coverage_comment.yml index e4d682b7cf..03af909d75 100644 --- a/.github/workflows/CI_coverage_comment.yml +++ b/.github/workflows/CI_coverage_comment.yml @@ -6,6 +6,7 @@ on: - "Test / aimlapi" - "Test / amazon-bedrock" - "Test / amazon-sagemaker" + - "Test / amazon_textract" - "Test / anthropic" - "Test / arcadedb" - "Test / astra" diff --git a/.github/workflows/amazon_textract.yml b/.github/workflows/amazon_textract.yml new file mode 100644 index 0000000000..82d7394127 --- /dev/null +++ b/.github/workflows/amazon_textract.yml @@ -0,0 +1,139 @@ +# This 
workflow comes from https://github.com/ofek/hatch-mypyc +# https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml +name: Test / amazon_textract + +on: + schedule: + - cron: "0 0 * * *" + pull_request: + paths: + - "integrations/amazon_textract/**" + - "!integrations/amazon_textract/*.md" + - ".github/workflows/amazon_textract.yml" + push: + branches: + - main + paths: + - "integrations/amazon_textract/**" + - "!integrations/amazon_textract/*.md" + - ".github/workflows/amazon_textract.yml" + +defaults: + run: + working-directory: integrations/amazon_textract + +concurrency: + group: amazon_textract-${{ github.head_ref || github.sha }} + cancel-in-progress: true + +env: + PYTHONUNBUFFERED: "1" + FORCE_COLOR: "1" + TEST_MATRIX_OS: '["ubuntu-latest", "windows-latest", "macos-latest"]' + TEST_MATRIX_PYTHON: '["3.10", "3.14"]' + +jobs: + compute-test-matrix: + runs-on: ubuntu-slim + defaults: + run: + working-directory: . + outputs: + os: ${{ steps.set.outputs.os }} + python-version: ${{ steps.set.outputs.python-version }} + steps: + - id: set + run: | + echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> $GITHUB_OUTPUT + echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> $GITHUB_OUTPUT + + run: + name: Python ${{ matrix.python-version }} on ${{ startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} + needs: compute-test-matrix + permissions: + contents: write + pull-requests: write + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: ${{ fromJSON(needs.compute-test-matrix.outputs.os) }} + python-version: ${{ fromJSON(needs.compute-test-matrix.outputs.python-version) }} + + steps: + - name: Support longpaths + if: matrix.os == 'windows-latest' + working-directory: . 
+ run: git config --system core.longpaths true + + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Hatch + run: pip install --upgrade hatch + - name: Lint + if: matrix.python-version == '3.10' && runner.os == 'Linux' + run: hatch run fmt-check && hatch run test:types + + - name: Run unit tests + run: hatch run test:unit-cov-retry + + # On PR: posts coverage comment (directly on same-repo PRs; via artifact for fork PRs). On push to main: stores coverage baseline on data branch. + - name: Store unit tests coverage + id: coverage_comment + if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name != 'schedule' + uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40 + with: + GITHUB_TOKEN: ${{ github.token }} + COVERAGE_PATH: integrations/amazon_textract + SUBPROJECT_ID: amazon_textract + MINIMUM_GREEN: 90 + MINIMUM_ORANGE: 60 + + - name: Upload coverage comment to be posted + if: matrix.python-version == '3.10' && runner.os == 'Linux' && github.event_name == 'pull_request' && steps.coverage_comment.outputs.COMMENT_FILE_WRITTEN == 'true' + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + with: + name: coverage-comment-amazon_textract + path: python-coverage-comment-action-amazon_textract.txt + + - name: Run integration tests + run: hatch run test:integration-cov-append-retry + + - name: Store combined coverage + if: github.event_name == 'push' + uses: py-cov-action/python-coverage-comment-action@7188638f871f721a365d644f505d1ff3df20d683 # v3.40 + with: + GITHUB_TOKEN: ${{ github.token }} + COVERAGE_PATH: integrations/amazon_textract + SUBPROJECT_ID: amazon_textract-combined + MINIMUM_GREEN: 90 + MINIMUM_ORANGE: 60 + + - name: Run unit 
tests with lowest direct dependencies + if: github.event_name != 'push' + run: | + hatch run uv pip compile pyproject.toml --resolution lowest-direct --output-file requirements_lowest_direct.txt + hatch -e test env run -- uv pip install -r requirements_lowest_direct.txt + hatch run test:unit + + - name: Nightly - run unit tests with Haystack main branch + if: github.event_name == 'schedule' + run: | + hatch env prune + hatch -e test env run -- uv pip install git+https://github.com/deepset-ai/haystack.git@main + hatch run test:unit + + + notify-slack-on-failure: + needs: run + if: failure() && github.event_name == 'schedule' + runs-on: ubuntu-slim + steps: + - uses: deepset-ai/notify-slack-action@3cda73b77a148f16f703274198e7771340cf862b # v1 + with: + slack-webhook-url: ${{ secrets.SLACK_WEBHOOK_URL_NOTIFICATIONS }} diff --git a/README.md b/README.md index 42891b8360..6412d5370f 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ Please check out our [Contribution Guidelines](CONTRIBUTING.md) for all the deta | [aimlapi-haystack](integrations/aimlapi/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/aimlapi-haystack.svg)](https://pypi.org/project/aimlapi-haystack) | [![Test / aimlapi](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/aimlapi.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/aimlapi.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-aimlapi/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-aimlapi/htmlcov/index.html) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-aimlapi-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-aimlapi-combined/htmlcov/index.html) | | [amazon-bedrock-haystack](integrations/amazon_bedrock/) | Embedder, Generator, Ranker, Downloader | [![PyPI - Version](https://img.shields.io/pypi/v/amazon-bedrock-haystack.svg)](https://pypi.org/project/amazon-bedrock-haystack) | [![Test / amazon_bedrock](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_bedrock.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_bedrock.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-amazon_bedrock/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-amazon_bedrock/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-amazon_bedrock-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-amazon_bedrock-combined/htmlcov/index.html) | | [amazon-sagemaker-haystack](integrations/amazon_sagemaker/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/amazon-sagemaker-haystack.svg)](https://pypi.org/project/amazon-sagemaker-haystack) | [![Test / 
amazon_sagemaker](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_sagemaker.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_sagemaker.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-amazon_sagemaker/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-amazon_sagemaker/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-amazon_sagemaker-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-amazon_sagemaker-combined/htmlcov/index.html) | +| [amazon-textract-haystack](integrations/amazon_textract/) | Converter | [![PyPI - Version](https://img.shields.io/pypi/v/amazon-textract-haystack.svg)](https://pypi.org/project/amazon-textract-haystack) | [![Test / amazon_textract](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_textract.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/amazon_textract.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-amazon_textract/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-amazon_textract/htmlcov/index.html) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-amazon_textract-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-amazon_textract-combined/htmlcov/index.html) | | [anthropic-haystack](integrations/anthropic/) | Generator | [![PyPI - Version](https://img.shields.io/pypi/v/anthropic-haystack.svg)](https://pypi.org/project/anthropic-haystack) | [![Test / anthropic](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/anthropic.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/anthropic.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-anthropic/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-anthropic/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-anthropic-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-anthropic-combined/htmlcov/index.html) | | [arcadedb-haystack](integrations/arcadedb/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/arcadedb-haystack.svg)](https://pypi.org/project/arcadedb-haystack) | [![Test / arcadedb](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/arcadedb.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/arcadedb.yml) | [![Coverage 
badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-arcadedb/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-arcadedb/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-arcadedb-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-arcadedb-combined/htmlcov/index.html) | | [astra-haystack](integrations/astra/) | Document Store | [![PyPI - Version](https://img.shields.io/pypi/v/astra-haystack.svg)](https://pypi.org/project/astra-haystack) | [![Test / astra](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/astra.yml/badge.svg)](https://github.com/deepset-ai/haystack-core-integrations/actions/workflows/astra.yml) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-astra/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-astra/htmlcov/index.html) | [![Coverage badge](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/deepset-ai/haystack-core-integrations/python-coverage-comment-action-data-astra-combined/endpoint.json&label=)](https://htmlpreview.github.io/?https://github.com/deepset-ai/haystack-core-integrations/blob/python-coverage-comment-action-data-astra-combined/htmlcov/index.html) | diff --git a/integrations/amazon_textract/CHANGELOG.md b/integrations/amazon_textract/CHANGELOG.md new file mode 100644 index 0000000000..522f6d5d8b --- /dev/null +++ 
b/integrations/amazon_textract/CHANGELOG.md @@ -0,0 +1,3 @@ +# Changelog + + diff --git a/integrations/amazon_textract/LICENSE.txt b/integrations/amazon_textract/LICENSE.txt new file mode 100644 index 0000000000..6134ab324f --- /dev/null +++ b/integrations/amazon_textract/LICENSE.txt @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2023-present deepset GmbH + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/integrations/amazon_textract/README.md b/integrations/amazon_textract/README.md new file mode 100644 index 0000000000..d829054361 --- /dev/null +++ b/integrations/amazon_textract/README.md @@ -0,0 +1,134 @@ +# amazon-textract-haystack + +[![PyPI - Version](https://img.shields.io/pypi/v/amazon-textract-haystack.svg)](https://pypi.org/project/amazon-textract-haystack) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/amazon-textract-haystack.svg)](https://pypi.org/project/amazon-textract-haystack) + +- [Changelog](https://github.com/deepset-ai/haystack-core-integrations/blob/main/integrations/amazon_textract/CHANGELOG.md) + +--- + +## Overview + +A [Haystack](https://haystack.deepset.ai/) integration for [AWS Textract](https://aws.amazon.com/textract/) that extracts text and structured data from documents using OCR. + +The `AmazonTextractConverter` component converts images and single-page PDFs into Haystack `Document` objects using the AWS Textract synchronous API. + +**Supported file formats:** JPEG, PNG, TIFF, BMP, and single-page PDF (up to 10 MB). + +## Installation + +```bash +pip install amazon-textract-haystack +``` + +## Usage + +### Basic text extraction + +Extract plain text from a document using `DetectDocumentText`: + +```python +from haystack_integrations.components.converters.amazon_textract import AmazonTextractConverter + +converter = AmazonTextractConverter() +results = converter.run(sources=["document.png"]) +documents = results["documents"] + +print(documents[0].content) +``` + +### Table and form analysis + +Use `AnalyzeDocument` to detect tables and forms by setting `feature_types`: + +```python +converter = AmazonTextractConverter(feature_types=["TABLES", "FORMS"]) +results = converter.run(sources=["invoice.png"]) + +documents = results["documents"] +raw_responses = results["raw_textract_response"] +``` + +Valid `feature_types` values: `"TABLES"`, `"FORMS"`, `"SIGNATURES"`, `"LAYOUT"`. 
+ +### Natural-language queries + +Ask questions about a document and get extracted answers. The `QUERIES` feature type +is enabled automatically when you pass the `queries` parameter at runtime: + +```python +converter = AmazonTextractConverter() +results = converter.run( + sources=["medical_form.png"], + queries=["What is the patient name?", "What is the date of birth?"], +) + +documents = results["documents"] +raw_responses = results["raw_textract_response"] +``` + +Queries can be combined with `feature_types` for both structural and question-based extraction: + +```python +converter = AmazonTextractConverter(feature_types=["TABLES", "FORMS"]) +results = converter.run( + sources=["invoice.png"], + queries=["What is the total amount due?"], +) +``` + +### In a Haystack pipeline + +```python +from haystack import Pipeline +from haystack.components.preprocessors import DocumentCleaner +from haystack_integrations.components.converters.amazon_textract import AmazonTextractConverter + +pipeline = Pipeline() +pipeline.add_component("converter", AmazonTextractConverter()) +pipeline.add_component("cleaner", DocumentCleaner()) +pipeline.connect("converter.documents", "cleaner.documents") + +result = pipeline.run({"converter": {"sources": ["scan.png"]}}) +``` + +## AWS Credentials + +The component uses the standard boto3 credential chain. You can configure credentials in any of these ways: + +1. **Environment variables** (default): Set `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_DEFAULT_REGION`. +2. **AWS credentials file**: Configure via `~/.aws/credentials` and `~/.aws/config`. +3. **IAM role**: When running on AWS infrastructure (EC2, Lambda, ECS). +4. 
**Explicit parameters**: + +```python +from haystack.utils import Secret + +converter = AmazonTextractConverter( + aws_access_key_id=Secret.from_env_var("MY_AWS_KEY"), + aws_secret_access_key=Secret.from_env_var("MY_AWS_SECRET"), + aws_region_name=Secret.from_token("us-east-1"), +) +``` + +## Running Tests + +Unit tests (no AWS credentials needed): + +```bash +cd integrations/amazon_textract +hatch run test:unit +``` + +Integration tests (require AWS credentials and a test image at `tests/test_files/sample_text.png`): + +```bash +export AWS_ACCESS_KEY_ID=... +export AWS_SECRET_ACCESS_KEY=... +export AWS_DEFAULT_REGION=us-east-1 +hatch run test:integration +``` + +## Contributing + +Refer to the general [Contribution Guidelines](https://github.com/deepset-ai/haystack-core-integrations/blob/main/CONTRIBUTING.md). diff --git a/integrations/amazon_textract/pydoc/config_docusaurus.yml b/integrations/amazon_textract/pydoc/config_docusaurus.yml new file mode 100644 index 0000000000..bcfb7104ac --- /dev/null +++ b/integrations/amazon_textract/pydoc/config_docusaurus.yml @@ -0,0 +1,13 @@ +loaders: + - modules: + - haystack_integrations.components.converters.amazon_textract.converter + search_path: [../src] +processors: + - type: filter + documented_only: true + skip_empty_modules: true +renderer: + description: Amazon Textract integration for Haystack + id: integrations-amazon_textract + filename: amazon-textract.md + title: Amazon Textract diff --git a/integrations/amazon_textract/pyproject.toml b/integrations/amazon_textract/pyproject.toml new file mode 100644 index 0000000000..a90ff53fed --- /dev/null +++ b/integrations/amazon_textract/pyproject.toml @@ -0,0 +1,178 @@ +[build-system] +requires = ["hatchling", "hatch-vcs"] +build-backend = "hatchling.build" + +[project] +name = "amazon-textract-haystack" +dynamic = ["version"] +description = "Haystack integration for AWS Textract document text extraction and analysis" +readme = "README.md" +requires-python = ">=3.10" 
+license = "Apache-2.0" +keywords = [ + "AWS", + "Textract", + "Haystack", + "OCR", + "PDF", + "Document Converter", +] +authors = [{ name = "deepset GmbH", email = "info@deepset.ai" }] +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", +] +dependencies = [ + "haystack-ai>=2.24.1", + "boto3>=1.42.84,<2", +] + +[project.urls] +Documentation = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/amazon_textract#readme" +Issues = "https://github.com/deepset-ai/haystack-core-integrations/issues" +Source = "https://github.com/deepset-ai/haystack-core-integrations/tree/main/integrations/amazon_textract" + +[tool.hatch.build.targets.wheel] +packages = ["src/haystack_integrations"] + +[tool.hatch.version] +source = "vcs" +tag-pattern = 'integrations\/amazon_textract-v(?P<version>.*)' + +[tool.hatch.version.raw-options] +root = "../.."
+git_describe_command = 'git describe --tags --match="integrations/amazon_textract-v[0-9]*"' + +[tool.hatch.envs.default] +installer = "uv" +dependencies = ["haystack-pydoc-tools", "ruff"] + +[tool.hatch.envs.default.scripts] +docs = ["haystack-pydoc pydoc/config_docusaurus.yml"] +fmt = "ruff check --fix {args}; ruff format {args}" +fmt-check = "ruff check {args} && ruff format --check {args}" + +[tool.hatch.envs.test] +dependencies = [ + "pytest", + "pytest-asyncio", + "pytest-cov", + "pytest-rerunfailures", + "mypy", + "pip", +] + +[tool.hatch.envs.test.scripts] +unit = 'pytest -m "not integration" {args:tests}' +integration = 'pytest -m "integration" {args:tests}' +all = 'pytest {args:tests}' +unit-cov-retry = 'pytest --cov=haystack_integrations --reruns 3 --reruns-delay 30 -x -m "not integration" {args:tests}' +integration-cov-append-retry = 'pytest --cov=haystack_integrations --cov-append --reruns 3 --reruns-delay 30 -x -m "integration" {args:tests}' +types = "mypy -p haystack_integrations.components.converters.amazon_textract {args}" + +[tool.mypy] +install_types = true +non_interactive = true +check_untyped_defs = true +disallow_incomplete_defs = true + +[[tool.mypy.overrides]] +module = [ + "botocore.*", + "boto3.*", +] +ignore_missing_imports = true + +[tool.ruff] +line-length = 120 + +[tool.ruff.lint] +select = [ + "A", + "ANN", + "ARG", + "B", + "C", + "D102", # Missing docstring in public method + "D103", # Missing docstring in public function + "D205", # 1 blank line required between summary line and description + "D209", # Closing triple quotes go to new line + "D213", # summary lines must be positioned on the second physical line of the docstring + "D417", # Missing argument descriptions in the docstring + "D419", # Docstring is empty + "DTZ", + "E", + "EM", + "F", + "I", + "ICN", + "ISC", + "N", + "PLC", + "PLE", + "PLR", + "PLW", + "Q", + "RUF", + "S", + "T", + "TID", + "UP", + "W", + "YTT", +] +ignore = [ + # Allow non-abstract empty methods in 
abstract base classes + "B027", + # Allow function calls in argument defaults (common Haystack pattern for Secret.from_env_var) + "B008", + # Ignore checks for possible passwords + "S105", + "S106", + "S107", + # Ignore complexity + "C901", + "PLR0911", + "PLR0912", + "PLR0913", + "PLR0915", + # Allow `Any` type - used legitimately for dynamic types and SDK boundaries + "ANN401", +] + +[tool.ruff.lint.isort] +known-first-party = ["haystack_integrations"] + +[tool.ruff.lint.flake8-tidy-imports] +ban-relative-imports = "parents" + +[tool.ruff.lint.per-file-ignores] +# Tests can use magic values, assertions, relative imports, and don't need type annotations +"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"] + +[tool.coverage.run] +source = ["haystack_integrations"] +branch = true +parallel = false +relative_files = true + +[tool.coverage.report] +omit = ["*/tests/*", "*/__init__.py"] +show_missing = true +exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] + +[tool.pytest.ini_options] +addopts = "--strict-markers" +markers = [ + "integration: integration tests", +] +log_cli = true +asyncio_default_fixture_loop_scope = "function" diff --git a/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/__init__.py b/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/__init__.py new file mode 100644 index 0000000000..3ef76c1906 --- /dev/null +++ b/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from haystack_integrations.components.converters.amazon_textract.converter import AmazonTextractConverter + +__all__ = ["AmazonTextractConverter"] diff --git a/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/converter.py 
b/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/converter.py new file mode 100644 index 0000000000..805cc2e00e --- /dev/null +++ b/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/converter.py @@ -0,0 +1,273 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import os +from pathlib import Path +from typing import Any + +import boto3 +from botocore.config import Config +from botocore.exceptions import BotoCoreError, ClientError +from haystack import Document, component, default_from_dict, default_to_dict, logging +from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata +from haystack.dataclasses import ByteStream +from haystack.utils import Secret, deserialize_secrets_inplace + +from haystack_integrations.components.converters.amazon_textract.errors import ( + AmazonTextractConfigurationError, +) + +logger = logging.getLogger(__name__) + +VALID_FEATURE_TYPES = frozenset({"TABLES", "FORMS", "SIGNATURES", "LAYOUT"}) + + +@component +class AmazonTextractConverter: + """ + Converts documents to Haystack Documents using AWS Textract. + + This component uses AWS Textract to extract text and optionally structured data + (tables, forms) from images and single-page PDFs. + + When `feature_types` is not set, the component uses `DetectDocumentText` for + plain text OCR. When `feature_types` is set (e.g. `["TABLES", "FORMS"]`), it + uses `AnalyzeDocument` for richer structural analysis. + + Natural-language queries are also supported via the `queries` parameter on + `run()`. When queries are provided, the `QUERIES` feature type is added + automatically and Textract returns answers extracted from the document. + + Supported input formats: JPEG, PNG, TIFF, and single-page PDF (up to 10 MB).
+ + AWS credentials are resolved via `Secret` parameters or the default boto3 + credential chain (environment variables, AWS config files, IAM roles). + + ### Usage example + + ```python + from haystack_integrations.components.converters.amazon_textract import AmazonTextractConverter + + converter = AmazonTextractConverter() + results = converter.run(sources=["document.png"]) + documents = results["documents"] + ``` + """ + + def __init__( + self, + *, + aws_access_key_id: Secret | None = Secret.from_env_var("AWS_ACCESS_KEY_ID", strict=False), + aws_secret_access_key: Secret | None = Secret.from_env_var("AWS_SECRET_ACCESS_KEY", strict=False), + aws_session_token: Secret | None = Secret.from_env_var("AWS_SESSION_TOKEN", strict=False), + aws_region_name: Secret | None = Secret.from_env_var("AWS_DEFAULT_REGION", strict=False), + aws_profile_name: Secret | None = Secret.from_env_var("AWS_PROFILE", strict=False), + feature_types: list[str] | None = None, + store_full_path: bool = False, + boto3_config: dict[str, Any] | None = None, + ) -> None: + """ + Creates an AmazonTextractConverter component. + + :param aws_access_key_id: AWS access key ID. + :param aws_secret_access_key: AWS secret access key. + :param aws_session_token: AWS session token. + :param aws_region_name: AWS region name. Must be a region that supports Textract. + :param aws_profile_name: AWS profile name from the credentials file. + :param feature_types: + List of feature types to detect when using AnalyzeDocument. + Valid values: "TABLES", "FORMS", "SIGNATURES", "LAYOUT". + If None, uses DetectDocumentText for basic text extraction. + The "QUERIES" feature type is managed automatically when the + `queries` parameter is passed to `run()`. + :param store_full_path: + If True, stores the complete file path in Document metadata. + If False, stores only the filename (default). + :param boto3_config: + Dictionary of configuration options for the underlying boto3 client. 
+ Can be used to tune retry behavior, timeouts, and connection management. + """ + if feature_types is not None: + invalid = set(feature_types) - VALID_FEATURE_TYPES + if invalid: + msg = f"Invalid feature_types: {invalid}. Valid values are: {sorted(VALID_FEATURE_TYPES)}" + raise ValueError(msg) + + self.aws_access_key_id = aws_access_key_id + self.aws_secret_access_key = aws_secret_access_key + self.aws_session_token = aws_session_token + self.aws_region_name = aws_region_name + self.aws_profile_name = aws_profile_name + self.feature_types = feature_types + self.store_full_path = store_full_path + self.boto3_config = boto3_config + self._client: Any = None + + def warm_up(self) -> None: + """Initializes the AWS Textract client.""" + if self._client is not None: + return + + def resolve_secret(secret: Secret | None) -> str | None: + return secret.resolve_value() if secret else None + + try: + session = boto3.Session( + aws_access_key_id=resolve_secret(self.aws_access_key_id), + aws_secret_access_key=resolve_secret(self.aws_secret_access_key), + aws_session_token=resolve_secret(self.aws_session_token), + region_name=resolve_secret(self.aws_region_name), + profile_name=resolve_secret(self.aws_profile_name), + ) + config = Config( + user_agent_extra="x-client-framework:haystack", + **(self.boto3_config if self.boto3_config else {}), + ) + self._client = session.client("textract", config=config) + except Exception as e: + msg = ( + "Could not connect to AWS Textract. Make sure the AWS environment is configured correctly. 
" + "See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html#configuration" + ) + raise AmazonTextractConfigurationError(msg) from e + + @component.output_types(documents=list[Document], raw_textract_response=list[dict]) + def run( + self, + sources: list[str | Path | ByteStream], + meta: dict[str, Any] | list[dict[str, Any]] | None = None, + queries: list[str] | None = None, + ) -> dict[str, Any]: + """ + Convert documents to Haystack Documents using AWS Textract. + + :param sources: + List of file paths or ByteStream objects to convert. + :param meta: + Optional metadata to attach to the Documents. + This value can be either a list of dictionaries or a single dictionary. + If it's a single dictionary, its content is added to the metadata of all produced Documents. + If it's a list, the length of the list must match the number of sources. + :param queries: + Optional list of natural-language questions to ask about each document. + When provided, the Textract ``QUERIES`` feature type is enabled + automatically and each question is sent as a query. Answers are + included in the raw Textract response. Example: + ``["What is the patient name?", "What is the total due?"]`` + :returns: + A dictionary with the following keys: + - `documents`: List of created Documents with extracted text as content. + - `raw_textract_response`: List of raw Textract API responses. + """ + if self._client is None: + self.warm_up() + + documents: list[Document] = [] + raw_responses: list[dict[str, Any]] = [] + meta_list = normalize_metadata(meta=meta, sources_count=len(sources)) + + for source, metadata in zip(sources, meta_list, strict=True): + try: + bytestream = get_bytestream_from_source(source=source) + except Exception as e: + logger.warning("Could not read {source}. Skipping it. 
Error: {error}", source=source, error=e) + continue + + try: + response = self._call_textract(bytestream.data, queries=queries) + raw_responses.append(response) + + merged_metadata = {**bytestream.meta, **metadata} + if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): + merged_metadata["file_path"] = os.path.basename(file_path) + + doc = self._create_document(response, merged_metadata) + documents.append(doc) + + except (BotoCoreError, ClientError) as e: + logger.warning( + "Failed to convert {source} using AWS Textract. Skipping it. Error: {error}", + source=source, + error=e, + ) + continue + + return {"documents": documents, "raw_textract_response": raw_responses} + + def _call_textract(self, document_bytes: bytes, queries: list[str] | None = None) -> dict[str, Any]: + """Calls the appropriate Textract API based on configuration.""" + doc_param: dict[str, Any] = {"Document": {"Bytes": document_bytes}} + + feature_types = list(self.feature_types) if self.feature_types else [] + if queries: + if "QUERIES" not in feature_types: + feature_types.append("QUERIES") + + if feature_types: + kwargs: dict[str, Any] = {**doc_param, "FeatureTypes": feature_types} + if queries: + kwargs["QueriesConfig"] = {"Queries": [{"Text": q} for q in queries]} + return self._client.analyze_document(**kwargs) + + return self._client.detect_document_text(**doc_param) + + def _create_document(self, response: dict[str, Any], meta: dict[str, Any]) -> Document: + """ + Creates a Document from a Textract response. + + Extracts LINE blocks in reading order and joins them with newlines. 
+ """ + blocks = response.get("Blocks", []) + lines = [block["Text"] for block in blocks if block.get("BlockType") == "LINE" and "Text" in block] + content = "\n".join(lines) + + page_count = sum(1 for block in blocks if block.get("BlockType") == "PAGE") + + doc_meta = { + **meta, + "page_count": page_count, + } + + return Document(content=content, meta=doc_meta) + + def to_dict(self) -> dict[str, Any]: + """ + Serializes the component to a dictionary. + + :returns: + Dictionary with serialized data. + """ + return default_to_dict( + self, + aws_access_key_id=self.aws_access_key_id, + aws_secret_access_key=self.aws_secret_access_key, + aws_session_token=self.aws_session_token, + aws_region_name=self.aws_region_name, + aws_profile_name=self.aws_profile_name, + feature_types=self.feature_types, + store_full_path=self.store_full_path, + boto3_config=self.boto3_config, + ) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "AmazonTextractConverter": + """ + Deserializes the component from a dictionary. + + :param data: + The dictionary to deserialize from. + :returns: + The deserialized component. 
+ """ + deserialize_secrets_inplace( + data["init_parameters"], + keys=[ + "aws_access_key_id", + "aws_secret_access_key", + "aws_session_token", + "aws_region_name", + "aws_profile_name", + ], + ) + return default_from_dict(cls, data) diff --git a/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/errors.py b/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/errors.py new file mode 100644 index 0000000000..4e65ac986f --- /dev/null +++ b/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/errors.py @@ -0,0 +1,11 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + + +class AmazonTextractError(Exception): + """Any error generated by the Amazon Textract integration.""" + + +class AmazonTextractConfigurationError(AmazonTextractError): + """Exception raised when AWS is not configured correctly for Textract.""" diff --git a/integrations/amazon_textract/src/haystack_integrations/components/converters/py.typed b/integrations/amazon_textract/src/haystack_integrations/components/converters/py.typed new file mode 100644 index 0000000000..e69de29bb2 diff --git a/integrations/amazon_textract/tests/__init__.py b/integrations/amazon_textract/tests/__init__.py new file mode 100644 index 0000000000..c1764a6e03 --- /dev/null +++ b/integrations/amazon_textract/tests/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/integrations/amazon_textract/tests/test_amazon_textract_converter.py b/integrations/amazon_textract/tests/test_amazon_textract_converter.py new file mode 100644 index 0000000000..46a7529b29 --- /dev/null +++ b/integrations/amazon_textract/tests/test_amazon_textract_converter.py @@ -0,0 +1,510 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from pathlib import Path 
+from unittest.mock import MagicMock, patch + +import pytest +from botocore.exceptions import ClientError +from haystack.dataclasses import ByteStream +from haystack.utils import Secret + +from haystack_integrations.components.converters.amazon_textract import AmazonTextractConverter +from haystack_integrations.components.converters.amazon_textract.errors import ( + AmazonTextractConfigurationError, +) + +TEST_FILES_DIR = Path(__file__).parent / "test_files" + + +def _make_textract_response(lines=None, page_count=1): + """Helper to build a mock Textract response dict.""" + blocks = [{"BlockType": "PAGE", "Id": f"page-{i}"} for i in range(page_count)] + for text in lines or []: + blocks.append( + { + "BlockType": "LINE", + "Id": f"line-{len(blocks)}", + "Text": text, + "Confidence": 99.5, + } + ) + return {"Blocks": blocks, "ResponseMetadata": {"HTTPStatusCode": 200}} + + +class TestAmazonTextractConverterInit: + def test_init_default(self): + converter = AmazonTextractConverter( + aws_access_key_id=Secret.from_token("fake_id"), + aws_secret_access_key=Secret.from_token("fake_secret"), + aws_region_name=Secret.from_token("us-east-1"), + ) + + assert converter.feature_types is None + assert converter.store_full_path is False + assert converter.boto3_config is None + + def test_init_custom_params(self): + converter = AmazonTextractConverter( + aws_access_key_id=Secret.from_token("fake_id"), + aws_secret_access_key=Secret.from_token("fake_secret"), + aws_region_name=Secret.from_token("eu-west-1"), + feature_types=["TABLES", "FORMS"], + store_full_path=True, + boto3_config={"connect_timeout": 10}, + ) + + assert converter.feature_types == ["TABLES", "FORMS"] + assert converter.store_full_path is True + assert converter.boto3_config == {"connect_timeout": 10} + + def test_init_invalid_feature_types(self): + with pytest.raises(ValueError, match="Invalid feature_types"): + AmazonTextractConverter( + aws_access_key_id=Secret.from_token("fake"), + 
aws_secret_access_key=Secret.from_token("fake"), + feature_types=["INVALID_TYPE"], + ) + + def test_init_all_valid_feature_types(self): + converter = AmazonTextractConverter( + aws_access_key_id=Secret.from_token("fake"), + aws_secret_access_key=Secret.from_token("fake"), + feature_types=["TABLES", "FORMS", "SIGNATURES", "LAYOUT"], + ) + assert len(converter.feature_types) == 4 + + +class TestAmazonTextractConverterSerialization: + def test_to_dict(self): + converter = AmazonTextractConverter( + aws_access_key_id=Secret.from_env_var("AWS_ACCESS_KEY_ID"), + aws_secret_access_key=Secret.from_env_var("AWS_SECRET_ACCESS_KEY"), + aws_session_token=Secret.from_env_var("AWS_SESSION_TOKEN", strict=False), + aws_region_name=Secret.from_env_var("AWS_DEFAULT_REGION", strict=False), + aws_profile_name=Secret.from_env_var("AWS_PROFILE", strict=False), + feature_types=["TABLES"], + store_full_path=True, + boto3_config={"connect_timeout": 5}, + ) + + data = converter.to_dict() + + expected_type = "haystack_integrations.components.converters.amazon_textract.converter.AmazonTextractConverter" + assert data["type"] == expected_type + assert data["init_parameters"]["feature_types"] == ["TABLES"] + assert data["init_parameters"]["store_full_path"] is True + assert data["init_parameters"]["boto3_config"] == {"connect_timeout": 5} + assert data["init_parameters"]["aws_access_key_id"] == { + "type": "env_var", + "env_vars": ["AWS_ACCESS_KEY_ID"], + "strict": True, + } + + def test_to_dict_default_params(self): + converter = AmazonTextractConverter( + aws_access_key_id=Secret.from_env_var("AWS_ACCESS_KEY_ID", strict=False), + aws_secret_access_key=Secret.from_env_var("AWS_SECRET_ACCESS_KEY", strict=False), + ) + + data = converter.to_dict() + + assert data["init_parameters"]["feature_types"] is None + assert data["init_parameters"]["store_full_path"] is False + assert data["init_parameters"]["boto3_config"] is None + + def test_from_dict(self): + expected_type = 
"haystack_integrations.components.converters.amazon_textract.converter.AmazonTextractConverter" + data = { + "type": expected_type, + "init_parameters": { + "aws_access_key_id": {"type": "env_var", "env_vars": ["AWS_ACCESS_KEY_ID"], "strict": False}, + "aws_secret_access_key": {"type": "env_var", "env_vars": ["AWS_SECRET_ACCESS_KEY"], "strict": False}, + "aws_session_token": {"type": "env_var", "env_vars": ["AWS_SESSION_TOKEN"], "strict": False}, + "aws_region_name": {"type": "env_var", "env_vars": ["AWS_DEFAULT_REGION"], "strict": False}, + "aws_profile_name": {"type": "env_var", "env_vars": ["AWS_PROFILE"], "strict": False}, + "feature_types": ["TABLES", "FORMS"], + "store_full_path": False, + "boto3_config": None, + }, + } + + converter = AmazonTextractConverter.from_dict(data) + + assert converter.feature_types == ["TABLES", "FORMS"] + assert converter.store_full_path is False + assert converter.boto3_config is None + + def test_from_dict_roundtrip(self): + converter = AmazonTextractConverter( + aws_access_key_id=Secret.from_env_var("AWS_ACCESS_KEY_ID", strict=False), + aws_secret_access_key=Secret.from_env_var("AWS_SECRET_ACCESS_KEY", strict=False), + feature_types=["FORMS"], + store_full_path=True, + ) + + data = converter.to_dict() + restored = AmazonTextractConverter.from_dict(data) + + assert restored.feature_types == converter.feature_types + assert restored.store_full_path == converter.store_full_path + assert restored.boto3_config == converter.boto3_config + + +class TestAmazonTextractConverterWarmUp: + @patch("haystack_integrations.components.converters.amazon_textract.converter.boto3.Session") + def test_warm_up_creates_client(self, mock_session_cls): + mock_session = MagicMock() + mock_client = MagicMock() + mock_session.client.return_value = mock_client + mock_session_cls.return_value = mock_session + + converter = AmazonTextractConverter( + aws_access_key_id=Secret.from_token("fake_id"), + aws_secret_access_key=Secret.from_token("fake_secret"), + 
aws_region_name=Secret.from_token("us-east-1"), + ) + converter.warm_up() + + mock_session_cls.assert_called_once() + mock_session.client.assert_called_once() + call_args = mock_session.client.call_args + assert call_args[0][0] == "textract" + assert converter._client is mock_client + + @patch("haystack_integrations.components.converters.amazon_textract.converter.boto3.Session") + def test_warm_up_idempotent(self, mock_session_cls): + mock_session = MagicMock() + mock_session.client.return_value = MagicMock() + mock_session_cls.return_value = mock_session + + converter = AmazonTextractConverter( + aws_access_key_id=Secret.from_token("fake"), + aws_secret_access_key=Secret.from_token("fake"), + ) + converter.warm_up() + converter.warm_up() + + mock_session_cls.assert_called_once() + + @patch( + "haystack_integrations.components.converters.amazon_textract.converter.boto3.Session", + side_effect=Exception("bad config"), + ) + def test_warm_up_configuration_error(self, _mock_session_cls): + converter = AmazonTextractConverter( + aws_access_key_id=Secret.from_token("fake"), + aws_secret_access_key=Secret.from_token("fake"), + ) + with pytest.raises(AmazonTextractConfigurationError, match="Could not connect to AWS Textract"): + converter.warm_up() + + +class TestAmazonTextractConverterRun: + def _make_converter_with_mock_client(self, feature_types=None, store_full_path=False): + converter = AmazonTextractConverter( + aws_access_key_id=Secret.from_token("fake"), + aws_secret_access_key=Secret.from_token("fake"), + aws_region_name=Secret.from_token("us-east-1"), + feature_types=feature_types, + store_full_path=store_full_path, + ) + converter._client = MagicMock() + return converter + + def test_run_detect_text(self, tmp_path): + converter = self._make_converter_with_mock_client() + response = _make_textract_response(lines=["Hello World", "Second line"]) + converter._client.detect_document_text.return_value = response + + test_file = tmp_path / "test.png" + 
test_file.write_bytes(b"fake image bytes") + + result = converter.run(sources=[test_file]) + + assert len(result["documents"]) == 1 + assert result["documents"][0].content == "Hello World\nSecond line" + assert result["documents"][0].meta["page_count"] == 1 + assert len(result["raw_textract_response"]) == 1 + converter._client.detect_document_text.assert_called_once() + + def test_run_analyze_document(self, tmp_path): + converter = self._make_converter_with_mock_client(feature_types=["TABLES", "FORMS"]) + response = _make_textract_response(lines=["Name: John", "Age: 30"]) + converter._client.analyze_document.return_value = response + + test_file = tmp_path / "form.png" + test_file.write_bytes(b"fake image bytes") + + result = converter.run(sources=[test_file]) + + assert len(result["documents"]) == 1 + assert result["documents"][0].content == "Name: John\nAge: 30" + converter._client.analyze_document.assert_called_once() + call_kwargs = converter._client.analyze_document.call_args[1] + assert call_kwargs["FeatureTypes"] == ["TABLES", "FORMS"] + + def test_run_with_metadata(self, tmp_path): + converter = self._make_converter_with_mock_client() + converter._client.detect_document_text.return_value = _make_textract_response(lines=["text"]) + + test_file = tmp_path / "doc.png" + test_file.write_bytes(b"bytes") + + result = converter.run( + sources=[test_file], + meta={"custom_key": "custom_value"}, + ) + + doc = result["documents"][0] + assert doc.meta["custom_key"] == "custom_value" + assert doc.meta["page_count"] == 1 + + def test_run_with_bytestream(self): + converter = self._make_converter_with_mock_client() + converter._client.detect_document_text.return_value = _make_textract_response(lines=["from bytes"]) + + bs = ByteStream(data=b"fake image", meta={"file_path": "/some/path/image.png"}) + result = converter.run(sources=[bs]) + + assert len(result["documents"]) == 1 + assert result["documents"][0].content == "from bytes" + assert 
result["documents"][0].meta["file_path"] == "image.png" + + def test_run_store_full_path(self, tmp_path): + converter = self._make_converter_with_mock_client(store_full_path=True) + converter._client.detect_document_text.return_value = _make_textract_response(lines=["text"]) + + test_file = tmp_path / "doc.png" + test_file.write_bytes(b"bytes") + + result = converter.run(sources=[test_file]) + + doc = result["documents"][0] + assert doc.meta["file_path"] == str(test_file) + + def test_run_store_basename_only(self, tmp_path): + converter = self._make_converter_with_mock_client(store_full_path=False) + converter._client.detect_document_text.return_value = _make_textract_response(lines=["text"]) + + test_file = tmp_path / "doc.png" + test_file.write_bytes(b"bytes") + + result = converter.run(sources=[test_file]) + + doc = result["documents"][0] + assert doc.meta["file_path"] == "doc.png" + + def test_run_multiple_sources(self, tmp_path): + converter = self._make_converter_with_mock_client() + converter._client.detect_document_text.side_effect = [ + _make_textract_response(lines=["First doc"]), + _make_textract_response(lines=["Second doc"]), + ] + + file1 = tmp_path / "a.png" + file1.write_bytes(b"bytes1") + file2 = tmp_path / "b.png" + file2.write_bytes(b"bytes2") + + result = converter.run(sources=[file1, file2]) + + assert len(result["documents"]) == 2 + assert result["documents"][0].content == "First doc" + assert result["documents"][1].content == "Second doc" + assert len(result["raw_textract_response"]) == 2 + + def test_run_multiple_sources_with_per_source_metadata(self, tmp_path): + converter = self._make_converter_with_mock_client() + converter._client.detect_document_text.side_effect = [ + _make_textract_response(lines=["A"]), + _make_textract_response(lines=["B"]), + ] + + file1 = tmp_path / "a.png" + file1.write_bytes(b"bytes1") + file2 = tmp_path / "b.png" + file2.write_bytes(b"bytes2") + + result = converter.run( + sources=[file1, file2], + 
meta=[{"source": "first"}, {"source": "second"}], + ) + + assert result["documents"][0].meta["source"] == "first" + assert result["documents"][1].meta["source"] == "second" + + def test_run_skips_failed_sources(self, tmp_path): + converter = self._make_converter_with_mock_client() + error_response = {"Error": {"Code": "InvalidParameterException", "Message": "bad image"}} + converter._client.detect_document_text.side_effect = ClientError(error_response, "DetectDocumentText") + + test_file = tmp_path / "broken_image.png" + test_file.write_bytes(b"bad bytes") + + result = converter.run(sources=[test_file]) + + assert len(result["documents"]) == 0 + assert len(result["raw_textract_response"]) == 0 + + def test_run_broken_image_mixed_with_valid(self, tmp_path): + """A broken image among valid sources should be skipped while valid ones succeed.""" + converter = self._make_converter_with_mock_client() + + error_response = {"Error": {"Code": "UnsupportedDocumentException", "Message": "unsupported format"}} + valid_response = _make_textract_response(lines=["Valid text"]) + + converter._client.detect_document_text.side_effect = [ + ClientError(error_response, "DetectDocumentText"), + valid_response, + ] + + valid_file = tmp_path / "good.png" + valid_file.write_bytes(b"fake valid image") + broken_image = TEST_FILES_DIR / "broken_image.png" + + result = converter.run(sources=[broken_image, valid_file]) + + assert len(result["documents"]) == 1 + assert result["documents"][0].content == "Valid text" + assert len(result["raw_textract_response"]) == 1 + + def test_run_skips_unreadable_source(self): + converter = self._make_converter_with_mock_client() + + result = converter.run(sources=["/nonexistent/path/file.png"]) + + assert len(result["documents"]) == 0 + assert len(result["raw_textract_response"]) == 0 + + def test_run_empty_response(self, tmp_path): + converter = self._make_converter_with_mock_client() + converter._client.detect_document_text.return_value = {"Blocks": [], 
"ResponseMetadata": {}} + + test_file = tmp_path / "empty.png" + test_file.write_bytes(b"bytes") + + result = converter.run(sources=[test_file]) + + assert len(result["documents"]) == 1 + assert result["documents"][0].content == "" + assert result["documents"][0].meta["page_count"] == 0 + + def test_run_multi_page_response(self, tmp_path): + converter = self._make_converter_with_mock_client() + response = _make_textract_response(lines=["Page 1 text", "Page 2 text"], page_count=2) + converter._client.detect_document_text.return_value = response + + test_file = tmp_path / "multipage.pdf" + test_file.write_bytes(b"fake pdf bytes") + + result = converter.run(sources=[test_file]) + + assert result["documents"][0].meta["page_count"] == 2 + assert "Page 1 text" in result["documents"][0].content + assert "Page 2 text" in result["documents"][0].content + + def test_run_auto_warm_up(self, tmp_path): + """Verify that run() calls warm_up() if client is not initialized.""" + converter = AmazonTextractConverter( + aws_access_key_id=Secret.from_token("fake"), + aws_secret_access_key=Secret.from_token("fake"), + aws_region_name=Secret.from_token("us-east-1"), + ) + + mock_client = MagicMock() + mock_client.detect_document_text.return_value = _make_textract_response(lines=["text"]) + + with patch("haystack_integrations.components.converters.amazon_textract.converter.boto3.Session") as mock_sess: + mock_sess.return_value.client.return_value = mock_client + + test_file = tmp_path / "test.png" + test_file.write_bytes(b"bytes") + + result = converter.run(sources=[test_file]) + + assert len(result["documents"]) == 1 + mock_sess.assert_called_once() + + def test_run_with_empty_sources(self): + converter = self._make_converter_with_mock_client() + result = converter.run(sources=[]) + + assert result["documents"] == [] + assert result["raw_textract_response"] == [] + + def test_run_with_queries_only(self, tmp_path): + """Queries alone should trigger AnalyzeDocument with QUERIES feature 
type.""" + converter = self._make_converter_with_mock_client() + response = _make_textract_response(lines=["John Doe"]) + converter._client.analyze_document.return_value = response + + test_file = tmp_path / "form.png" + test_file.write_bytes(b"fake image bytes") + + result = converter.run( + sources=[test_file], + queries=["What is the patient name?"], + ) + + assert len(result["documents"]) == 1 + converter._client.analyze_document.assert_called_once() + call_kwargs = converter._client.analyze_document.call_args[1] + assert "QUERIES" in call_kwargs["FeatureTypes"] + assert call_kwargs["QueriesConfig"] == { + "Queries": [{"Text": "What is the patient name?"}], + } + converter._client.detect_document_text.assert_not_called() + + def test_run_with_queries_and_feature_types(self, tmp_path): + """Queries combined with existing feature_types should merge correctly.""" + converter = self._make_converter_with_mock_client(feature_types=["TABLES"]) + response = _make_textract_response(lines=["Total: $100"]) + converter._client.analyze_document.return_value = response + + test_file = tmp_path / "invoice.png" + test_file.write_bytes(b"fake image bytes") + + result = converter.run( + sources=[test_file], + queries=["What is the total?", "What is the due date?"], + ) + + assert len(result["documents"]) == 1 + call_kwargs = converter._client.analyze_document.call_args[1] + assert "TABLES" in call_kwargs["FeatureTypes"] + assert "QUERIES" in call_kwargs["FeatureTypes"] + assert call_kwargs["QueriesConfig"] == { + "Queries": [ + {"Text": "What is the total?"}, + {"Text": "What is the due date?"}, + ], + } + + @pytest.mark.parametrize("queries", [None, []]) + def test_run_no_queries_uses_detect(self, tmp_path, queries): + """None and empty list for queries should both use detect_document_text.""" + converter = self._make_converter_with_mock_client() + converter._client.detect_document_text.return_value = _make_textract_response(lines=["text"]) + + test_file = tmp_path / "doc.png" 
+ test_file.write_bytes(b"bytes") + + converter.run(sources=[test_file], queries=queries) + + converter._client.detect_document_text.assert_called_once() + converter._client.analyze_document.assert_not_called() + + def test_run_queries_does_not_mutate_feature_types(self, tmp_path): + """Passing queries at runtime should not modify the init feature_types.""" + converter = self._make_converter_with_mock_client(feature_types=["TABLES"]) + converter._client.analyze_document.return_value = _make_textract_response(lines=["text"]) + + test_file = tmp_path / "doc.png" + test_file.write_bytes(b"bytes") + + converter.run(sources=[test_file], queries=["What?"]) + + assert converter.feature_types == ["TABLES"] + assert "QUERIES" not in converter.feature_types diff --git a/integrations/amazon_textract/tests/test_amazon_textract_converter_integration.py b/integrations/amazon_textract/tests/test_amazon_textract_converter_integration.py new file mode 100644 index 0000000000..bbe79e9e04 --- /dev/null +++ b/integrations/amazon_textract/tests/test_amazon_textract_converter_integration.py @@ -0,0 +1,89 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import os +from pathlib import Path + +import pytest +from haystack.dataclasses import ByteStream + +from haystack_integrations.components.converters.amazon_textract import AmazonTextractConverter + +SKIP_REASON_NO_CREDENTIALS = "AWS credentials not available" +SKIP_REASON_NO_REGION = "AWS region not configured" + + +@pytest.mark.integration +class TestAmazonTextractConverterIntegration: + @pytest.fixture + def test_files_path(self): + return Path(__file__).parent / "test_files" + + @pytest.fixture + def converter(self): + return AmazonTextractConverter() + + @pytest.mark.skipif(not os.environ.get("AWS_ACCESS_KEY_ID"), reason=SKIP_REASON_NO_CREDENTIALS) + @pytest.mark.skipif(not os.environ.get("AWS_DEFAULT_REGION"), reason=SKIP_REASON_NO_REGION) + def 
test_run_detect_text_from_image(self, converter, test_files_path): + """Integration test: detect text from an image file.""" + image_path = test_files_path / "sample_text.png" + if not image_path.exists(): + pytest.skip("Test image file not available") + + results = converter.run(sources=[image_path]) + + assert "documents" in results + assert len(results["documents"]) == 1 + assert len(results["documents"][0].content) > 0 + assert results["documents"][0].meta["page_count"] >= 1 + assert "raw_textract_response" in results + assert len(results["raw_textract_response"]) == 1 + + @pytest.mark.skipif(not os.environ.get("AWS_ACCESS_KEY_ID"), reason=SKIP_REASON_NO_CREDENTIALS) + @pytest.mark.skipif(not os.environ.get("AWS_DEFAULT_REGION"), reason=SKIP_REASON_NO_REGION) + def test_run_analyze_document_with_tables(self, test_files_path): + """Integration test: analyze document with table detection.""" + image_path = test_files_path / "sample_text.png" + if not image_path.exists(): + pytest.skip("Test image file not available") + + converter = AmazonTextractConverter(feature_types=["TABLES"]) + results = converter.run(sources=[image_path]) + + assert "documents" in results + assert len(results["documents"]) == 1 + assert len(results["documents"][0].content) > 0 + + @pytest.mark.skipif(not os.environ.get("AWS_ACCESS_KEY_ID"), reason=SKIP_REASON_NO_CREDENTIALS) + @pytest.mark.skipif(not os.environ.get("AWS_DEFAULT_REGION"), reason=SKIP_REASON_NO_REGION) + def test_run_with_metadata(self, converter, test_files_path): + """Integration test: verify metadata handling.""" + image_path = test_files_path / "sample_text.png" + if not image_path.exists(): + pytest.skip("Test image file not available") + + results = converter.run( + sources=[image_path], + meta={"custom_key": "custom_value"}, + ) + + doc = results["documents"][0] + assert doc.meta["custom_key"] == "custom_value" + assert doc.meta["file_path"] == "sample_text.png" + + @pytest.mark.skipif(not 
os.environ.get("AWS_ACCESS_KEY_ID"), reason=SKIP_REASON_NO_CREDENTIALS) + @pytest.mark.skipif(not os.environ.get("AWS_DEFAULT_REGION"), reason=SKIP_REASON_NO_REGION) + def test_run_with_bytestream(self, converter, test_files_path): + """Integration test: convert from ByteStream.""" + image_path = test_files_path / "sample_text.png" + if not image_path.exists(): + pytest.skip("Test image file not available") + + data = image_path.read_bytes() + bs = ByteStream(data=data, meta={"file_path": str(image_path)}) + results = converter.run(sources=[bs]) + + assert len(results["documents"]) == 1 + assert len(results["documents"][0].content) > 0 diff --git a/integrations/amazon_textract/tests/test_files/broken_image.png b/integrations/amazon_textract/tests/test_files/broken_image.png new file mode 100644 index 0000000000000000000000000000000000000000..302caf170f7838f30764c1c66e8a525b719255e9 GIT binary patch literal 20 bcmeAS@N?(olHy`uVBq!ia0y~yU}OLQCMW{m literal 0 HcmV?d00001 diff --git a/integrations/amazon_textract/tests/test_files/sample_text.png b/integrations/amazon_textract/tests/test_files/sample_text.png new file mode 100644 index 0000000000000000000000000000000000000000..25d202e024d211ee2a9411083bdaad0664f060db GIT binary patch literal 23511 zcmeFY1y@`_)-H^@1r1IJ?(P~OxCgi35ZoJg_ux)&C%C%?ck9O8-R64ar7UO2eY7Wo|#>z}8~ zEFB!D%d8s2d-mQkcE=Eg@sq_hCIbrJN|U4~|PaS%5=5 ziM;~Ymsz?=Cl~mT#UaRsWJC<1q)UEuuVJX3%*G*l8oPpoBcRFN9RSEA=Z+EX9*ThR z-^3sk&HOYT9rf{ob|em_-0S=6l%uqeFo}_OSouIiZZl-g_El+3;I;l}+;ubb3zmPo zFkI3bo~Z(qMrG`wN%$l5Q>v<=x))~uw=igAitu<0t1m>Q!{NxJ7t)MlTM}qb!$~z| zx(uI^BMU!4{_w6~5F;Vt-{|63WKHFF6_D+XCF)MOs@oEqZ|EgKf_=U0)BR0SlbFF$ z@>yLj?BkdO1saOVeoVr^Ob`!e?D6Oow9&k=gQu45jh`?H*#xTcLqeCRK0!o^oe^NvHyg{+D!v*Zo&>H7%ti`VnoB5Y7t7f z)!EEV0vY@EyJdq2dZd ziQlt`OG#9nz1T)rTd#X_Gil~BYHhoJvznSA`s|wy+&btNq;Lc~Y++RHgQ|6C>|Fh# zGogy|{1fo0#8{rRjN7in{NbhDu8cWg|6((H3G;-^6^`FOt1rC)N+vxfTj``ke7#0~FQ_ zW*oj~qc)zlX6W3fw-N)pYNMwvdW7;@`lbtX2(Osi+uPTdBxHWL?pTtGB>M+?^HKjC 
zTvUh&5?s1n*dJ&b#0VGTJ${NgkSvh6-MD+(P#}Nq3wTj76mfq*912z6LEQ#lO5ovl zE=`CCQCdyJ1%IU9u-5SN-kOqWC)1AKFjq( zuu4>5!1u#$l66X9VEXry|B)j287}pKl8sO+ERnoC9?67ICGaJT=&J(Rew^zLl_UD$ z7kzS4C6vNnttG5KqGP`D(%_*=Zcj^FQ1PKMisStnop!88S`A#tr=Ge#P^*Xb4!JXe znC;d~#or5aU?IT`F8q$4N|X_q^xaewM?I9Y=h+MYxVyFHp67%4mNn=@)rOJ)B;MBS zMau(QJ5=@H@ulj6=?lqPpzo(C#IG>$0eJq3sUpJ^a@5#ZBMAJ6p$NsBb#u}HF$zpA zal$-!FWiLi<$kSx(|-AW@BZz6*DYux=5TQ~GTt~ZDP;;s+MI84DUMM4l4wmJs`sIi55@Sf!pu9h+GOTS7cdT1#|U)G|*u*I7kKrJ
sq*Nxn%^QF2m^S=lZ`zi{nC z2O92Rt=XkkO9f5^bcIy~`sXM~d&w|K24ZFIt(mZ)H?U8PM$ zLIs`V-_o6gC?*4^@I zpWd1N{@SkLVf^7xX0B%Henu1Z;Y+127J!4YsbeqW&Nzd< zbHr`u?;e#L5}UeURf#{VF9Z z1I)UU#-J?0yeYb#ckgl5E$0;pw9`b{R~N%NgTL)6??NGSyne z%r5wHMb6LRDSHRZEXma4^=^@FdA=HUw{#CJN$bMaW9uf6z202PyndoSr^R`1yLY3P)NHdZ zs7~7~V_cu*d5!2S)5a;j<AII;uJPAh;pp21Rw&-E~*&_G63ncGX$5{)kq@J%&7YOU7Q6iE!9v1yTTGkQ`ek3IRordHLLDLNBOZ4;LwUjE9r^GneUVFO6($95UB<4Ft6r)!n`6OIxBk8+~-^NV^4xw0zQHSLLho5 zv90BmE#FPu>||T|cshOT0kM`tqKEVJbf|ed7}{2_6Bnhraa> z(nX(}pX-HKDbURCj={dWFZ09Yr;C_XtIgRS9or!Xg9#Ze+y*cD7tcdF*LGO8DWK_A zRoBG>+j&w~-;-nea~?L)AZ-3{MwWXmq>&3Bq8$VaA&3t_ z1>#nrF1rtX4Jo;=e?@qqu2>VhBYbfx4Sc_0UA;Kp1Fk1HXT1{^>&h41s_Le}OkL5P)2$|49wSkqiAle#pOdfe=*{la>a%s>Y6{ zrnXKNcFtGJKy+{bqP>)s69fb<<=+=lT7}{Qod1I5H%(_vc{x5~I~yiL6FVbQCV-9o z-+Ukh0DNGtjj6LC8NkNc)`<@wNdAu$d|?0IU}kc%eV8zVRc zqm#R>vmt=d*6H&?|0CvqOZ*Q? z&Htt3z~QXhO6dtROGd_?72Yq5iInh$t!1 zgoO1Jlc+h>o;rCnyN`0Zb_Dhfc}jO~-b>lCrHwsKf_6sJ*ju8><$J}UC`A7CqotY( z3!oaP94bqIK#cpxj|U2y;{@T~DN3nDL=&E<3LyU-0*;3M_s4&u|A&?TWeZp+{wogu zSBk?dM8?Gssc)iHb%(mX&!egP>+PcH)6L=4zVN%&f|fb){bI2qspVpo9(0BK>bJwI zQRWnQ5`YM``h#J9)&ht^F+2_TiMkl-!a27U^}{;uSkgP{pq;c@wl>S1ev*= z?7ZVX`vpwuxazX_hpxOrg>=5uVS*;YvZIK()aDNRb@kRMc3O??V88&AAQeL z%1Gu;z|7{E=Y3PxE29G`7gOCB+uL*V>m6`3=k2CA&%MU@WU0pSurMWg3~Rk*kicp< zNkbohUAH=C+7}L;&Tt8MX1 z3RV%i%w<)5cOwwpmEopHxtMw;Y)){lRp8hz(upd&$;*X(p{| zk+{uDeW`L6&vlV}##d-Kl(27^NdoI&#^aW5Fzc+1&{q=)-p;9po4y_!ywiEM&sWQ) zFvTPN?sv4_R@2Z^b8uI$ieTbz?_qFTbyD$0wFh~T>R#z7`X17Jt|tdq)vZTmQ2Z&{KWLY|?^jK025`%pGh2tlgjUNnm>)Mo zNp&yj7EEIW@8TM5fP93TrT)dRYcjI@r}YUSFhL&F5ZB|bajYPf@8hUJrW-t=GRcXq z(>Q0LR-M_z_0`R+B7ZgN9k=C-_yo^33NM&jm(#^y@zrfy^RT+d&;E(Z4VpTCe%#N= zNJ9 zw)n5$igD7pt$W`~Tzs<#4ae2|?k_c<7%iX$Wa$#%kOtlC9RS6H{4@qKeI_|m;-1GH(1{>?}mO(Mvra=CeZq+}3nZ2%sd z1!8qVu}DAr&qI-ZT5NRtD{l`*d+Xy=!R`i=q$el}wS3R<_MKM!ej)aG*u)`LT}-PGqBD!gJ_3)Duukyc&3iun0l#wEvY#FK_BiRgFi)##JAx=R za-36ov`ubG!D${9#x3g=53ZyjS4^MJ%Tv)>@ zF%UI>e^NJj_sE$|r&3*b>L`1=^Y}_row+0o=8w0}trT{mT6k^`Bmr=DR7ALvs!(m} 
zoAEYj(tQMdRT#@$%<*S&3M@p54;yg_K4JtiUw*HqnP(rC1^UUOG*D1Bk8OrXej%d= z9U4A?*}XeRXqU*n?lt@RX&BJ5o175MwIYSaf5PHV&!4@a|90Q?&d7iAI}ltWAvT9C zaev@zICZu?kh5xNeh9R+?(!(CXQtxj!(%LM#p`Wq}f;vNl0t-E4Kn_Pq zawA7ghEllRFuR{il@e4Hcn2y!Z{Yb~AMx7newtL*vDv-Dc|OmN6~aX6F5yqBWRl)G+X#zel~D8vbqA_p`ReCh4yFH$?sR z?V`bX$gwjx=V;jC5`K#XX0=ZmA2M zCY1~4Pm$DTb1Np&_3&nCSD2(8ag(R(B7sp*cKM-`$mSxzrTH|Wo$u~uw#jZ?iuo(- zkS0FD$lOx!!LoTy^aYGHu&*$pP|6tk#~eSy`$2xJf&lKWuJ`RcXAI;1ujvfCf;z*9 z22u$5yN#pilvI zT}!|8krq3M3JE`X+l=%nB_o9&q+3+0vl$gO(t~69KKYE_QpwL|MBUtth#?7RS#N>}=Q3O8 zA!0Z2Bib-~rvV{vg6O$EmG3QjQ77al|Y{e>y(Y#QE(RV=4z3QA0$=)MJSYGY{t(Vn{ zH2}ofEuo$Y-%_cI@BPGa%N8t=tSk8v0_KcK^3=YyC+F_+<)tZQ0Nnbb{eoh{rF0TP z`pOOS?cl=$8LCPQOX4ed=KuQi6evZF4^WUKc1dK{5mf6G#uxLJ2wJ}bYoFEaQt>HG zd<)VrwHsB`ox61`ulsFu!REnb2LMFN&QDp8JxOODf#1oUC4^)2+dlmu4yF=#gN0f_ zxUm&L7~tK~M`=3(X!VyQr``Pe<8gjo_p|In0bWiO!CLACN7XsR<f;sz3YW= zb7r6L5DP_|JLb^ia9}8u_@?&4qW}hbgQPKd?pu$mbo!73Lt)DCQd#3gSixKy7@pg9 z1(`T|@de1Pb`Ec1W+UhlZDb)NplWm*V<2IU2I51hE4)w{aTh6M@y*lV4fPRQ5B-P= z#nJjbceu-hZp(9NvfEd#dJ@xYcD@$w zE@Q($X~6+E%udTCP{RE6+8w;0+fxiKwh#~uqy@bLJh(cgjp`F=E`xEkXkDzAsNqlv z5;c3_Nxe35esXq}k;OL9i(7a-RFD~;1S-tm-F*quNT|s1PhrhUx%xRlj6`6SQa}lF zYFV76K4p+FBDl!o5G z*wO29*ecsJ`tuFYhVo`)^SNIfY5+lL3j6b{NTU^5i4%Tbxgj33&Q@}X7?9wA`EtW_HD$m0L4=y z45o4%j?FkFR?)yE+a@5ceJ?n=`}}fk>qHIgwJJ*TfTM`%?uQ1G0@re)x*79Wc6|%W zF3L5(2uC&W0^nv3DqobQATUUzek&T390Vqz5>=VI5`{E^r^AnsNa;o<1Q)K}Ru%67 z#~kVgs$R=35rymKcnB~|sqBWP&F<6}P5C(F{t065`Q zLz_^)X&EdtvtbeP>_nUD9456HSojLWbXiqrP2Lpjan6Pbt1nyAWC zL0xWvOK7s-wuUTOpAd6{TBQnMK)7TPVSq&1-pojV1_|$0>FUFw=k5(EOlluyGp%sDJu0*Ke$6y$)_%3A~y5^5eE_ z|448EdwujV#djDQizsnHW(S9c(z<3A;7N4BkDlFFWR}jGi(xWDo)KG&RQ-&^3vo-l zh$l>r3tSvot_sNKzT#CeG~WJs!n*(3_wAT0CNvGil=Lzf=O?^gq}AT0+J5pMN4WW->VVApY2Z z!@-G;I`s>5)#2&&&bqzt_+|2 zw9PJzyv;M-nnBPx%9EVROp3%4-{%V+xdrO&p7rG7pSc@GtQ>_QnJ0!Q*-Ty<7jy_H zEuX5;`phr!gVafxq{%2mM6qT{_FXa>Baux?4RFv5AY&tWFhepA5t@G<%5NaV< z)*;lH1@s07a3hdA6jIM3gMJaiEN*FzA=Gu-q9AL#U#1Cfdh%UlXZ=v26~$UJiEt0? 
z?KT@bK%8wyAV48C9L!!O>P(;(fM3|`5q$~W9pI1JLOkC=3{}4%rb}kwA8BW&VQ1Q- z15a0mPh!8S^uwmh%#K`nMOA@srt64f-7-fB4{!LADAyr>^x6G1#3>3}F*pDK_D{k1 znsO9MI$+UmpwxgL+T7LH#G=V&b4VDB#=sMfVjo%`j3}0)(3LuUC2Ee_3<>*$4`SH- zF(_MlLLOoYW<@4ppEw5XZ{#(qxp}67B~tWkRCDOfL(FWn>+hR+cC)pYLO7rY)=YnD zJ~@MVjVsSKD|WHG#9T->&_*=$z!L*G(3Qk1tWF@SZi~;vdCJ zD7|kxgwca&g^a^!P^KE%FHLDh1KYuy?sjHER^$tm?hf=Ge?xREVhr#V*RJ~q4jwn2 z3aFTceFxiqDMLPiEgd-yGV~>E!mZgHnu|s<<|2%3i4eeuq;KHIAo#KNU&GMVzZy`F zg1(jJit2k7V~nJ;nIdtiC5|8xV5?-j;S6H^dMD4tC;7eh&|pc{}9!*F^FY*vN!$PxEC)(KAofVYZ|VO z@uS16bcB3(4*gq8W^T^_kMriXm&5av`ktD*Zt;Y6M(zm!$zU^9%Q%@#;j)UjZYUYy zi6I5tf+asg8G3fh@L%H>gJ%1g8lr)Y8v{s!^wRrHHW)Mx%BC(gl(q-IzSkIU%K~6G zPJ>=AzeVRFnr{Z3YlNrZzw3tQmNxos5*^TUSq)ncD#e6b0T-$bx&TQa!r*Zoqep8^ z!nETrQ@Nn2KI?sX?JXM+fEo4gHaa&~I!|R1crnA-uXYftfUA4p@T&&};?=MP0kTsX zQ8b(D8VU~vWRH>Km!=Gmsf!e+K~Re6V07(zy_2Qmnj!7G85&%_vV_;XVCokaSdK>D z?duD;PhXOU=LYomZdlf9f}njO5qo>A`<6UXFDUGom)Qw>koz}mEWB9^0u22DsF2ay3`du`*V$r4x9mLokHa5xi9hYJFnl}2dWynl&%;vS`4u9# z0q1}+DBWdgb_?moFTD0uk1m**T8;pT-qjj_)s*u|YD7o=y>@%EB@l0}z0nLg84!!E z4@#rhv3~)dqk-;cL;No5sFKN+Dk>g0OLPtVnwDZv4YtC3DuvSz?+u5E*neNtF+MEA zwwLt^r0@vQ4vrlSnRX&xEH>T}&cr1M*@V9#WR~aYhlvM^E+73!!R^S6jDVhwY&wuH zJz;gp67`Lha27DQ31Q$1+EECdj@(P3W5+nRi*yCqctq`osfzupS3KyrVp(-E= z9(X~l6021?S~dx7HQV*QE<*D@)TO>SLjApb>((+hODDMa06OCN_T23@rOV}E+4Zq- zFe_XrGi*?$#einc{XZuom=qToGU=(gLqshe#5fT;1PhdegSd(V&tP%mNzCJ@ z(0cH3Xr#vv312fYjz(%$QtrJB2w3?}pl_#^qA2^i(zQ56932#a7$u)Pa9+#!Ks+>G zJn*0cm7tB1FOi84X1CE&F7?1ZeSH!KlLf5$z6Ol)QHqGF76ZTq79>N2Sh(C39WZd1 z2M)@${Q8Q*eq=WrJ_zj&7+B%qOxcZe7g zVWh~iU@%Avxl2|Wxz%v_P-{q$k%8x%?hU$s+;54z zV)|)(UH?OYVDwc9pdHwJoMI$2sUVs#=~_JhuUoCSzYCDG9|{Cw|GFi61kYIiO&|jG zUq1X(LjKE#|MKDg4gG-H&RdPosgRxaQdGSgfXtP&X->)rUJu;?o7vtGi9826+4eKe z%lRDgzebxp@Jfz;8OJR-;xVIw#3X~nq*3$>C=$r`7-e-ktDdo3+%}zP3g(_To;;`4 zanq_^I~u@>v|g&q@#5TXD^qTN*}$n?B4;cUih=*9sX-zYcrHj+>^SqjNuzZ<=pMD2 zGc2y;|FbKGR$Y@txJT+=kB}Z&-6ctq@el#;w0HR|GvuOvpEFzg6Ni7MH{+JVp93>D z*Rs3P^=8A6?d}rq$K3Db0OaB8_jYky+Zv^4ZxD+w0uj#!<|7~1&do}_8?r*@%E+i} 
za23X_4D`U^8bX|ibW z?v22r-21-Iw_uiO=#QfC`%le2WH=Irw(})U2U6|@bpHh%A_isb$As9fM+U9@`G_|s z?+OzM1aKZt2hal4h&#dH*%R94a2j2Rk{Y$+&!X)lNycavy23?Ho3pO0ZCgRtJ5iO_ z#UDI2=Ld_f(&G@TKWl1If)OvkW z_vN~0vszc6TJMSN>)ms9Vlj1>vazR^7$FYy(CupB_v@&8;iR@fK}O~dAyuL0E2}2E zmco=cv{N0~YT{OenDAl>67RV)>Kso!eZVcFyRv)t-MogL{+-H%9$#Z5p7Qkm514VQ z*|mf=(sY{HwMQwTcgL9P#s)1@*xgl?r<2;?`>iOF=XwFwLp5KY6Jd|{tXK1jOwakO ztIG=J*T!t1Yw~`PT#DAnzQpd$<2&!Cz9*|ns?m3M>6H7O&v%K*qs)vBLK$w_I<9N3 z#_oV#S+>i<(yKin=?FA8%H6JUA=hcl_S&m`@?CY4$;h1BP7?Qe(jWl{En}s@mKJE& z>0eEFMV{sUc!~qO?E3s53|BAd3DCSSN)@0#L#?ug+T5|W9n>{4nGaVa>u2_3NoRwvBee#8*4%`S^kQvy^O~66>1F6`%KHZ-uHQG+zu2 zDL)uTX|TqC*>Z~jJR1F_ujEdNMw@m^a&NDs~N(85stCO4+JDV@dfg0A!eHl54GX02?u)^Ynla_5BJ2T$Yx z9Ua6X?(=3^W!&%nEDnn<;yJWyC5C1ZRpN=`O#zv@eJ}d_GjEkM?;R~)0u-Dz z4+r0U`6H(jC2j#Tenr8urm@E}&z7TPc$~1|cDyCz@v7^= zQ3xQ;vgx#K7&DSlw@=I%*2to)4o#A&T+D--WMR~%O1@1o<1zk*$HLw}45&G}S_!1$ zv*wYs$)-5>x+u+BaL2CZ*|xuLA>j9d)cpjBIZO#Y6@^!tt*aAu zGB${L)X_mS{AbP;qoMCxVqy-Pu%9@OY9WPuy}&b^@Gw(oHZAFolUT}r2N0m=az=9- zz|{J77C?AkM$brJG72UCzEcY}jj^k}HkhPob_gY>3m;mhKB>MG%K#qkr1tb>vDBiz zq90O9o-OtXsn;`5TWl15ELp>swTS&iecKQ_u|4z1>`M`6TCCR3!_1qI_;W2DHL}~7ewILM7u=tkkkp$Cxmqq)5pt@=n@sN};@5kGw zt6h!tAAU#HEl&&Ug7C)`%Q9yjcQ+R|KEyQ)-Y!Tts?6sLb-2a(OP(DHYt*VJOl3aZXTU;E zCwJGLzIRA9+Pq*f(+$~y<;?XE;;^>oJs?ps%KclQiR7dArVMMW3gs8F32LGCkJ-gs zr)cMx0f~&1<5Pi4&nJ<*$r5OXbshI`M#hw`4}mRTpeW*G!CODniseaCF72ug@62i( zGd!2sWzuApvA{H`U&4M`ze%8JQ+moob#n5o1~TIGpm}DzwXxMeAlo#k`c7vWh`h47U`g zb)(5lghmNu>05~UoPfoj;I`&s3aPu=tO&08;6_=Ss@GQZ$H-8^Npd_01b}XZ=S zuEvzo6`Ma9DD4MxCA#s~c=r803vmf-cKylduXXI{Dq_3(>{It;u{4G1-Z7o6IJsDe z301t@hm^4yGHH!caTQ1ARyt|rBLX%`OtwT6<%s7lN{tYVMFFaxC=Kt&%N3?w=kAj} z>(^CI6GFhp(@E-cl7$t|)AN`;?zYU0x?$ z9&y|6Vln}5^S18qf*F5^`#jrBv+z9^joD6;X>Ij#Ca7iZ*;iUjvv`}%rQBWD$eYeO zr(aF0_vB4_z1eq;GR|C%32htFbl0}AEn9$SD!!{OI|-9r$*+7j4nVhM2c0x7+gzQ6 zw5DVqiqkfa-kqzXM(d%J!?g!Tz%YrcD;&#dDg5ml|D_^OW*U*VULxv@s@y1fyC!WwvO-_ zsbJx)PaC+6wEdE^I6CJ;ZY?sn9W~$FJ}7Xb9x~aWugx-r5#FTXmm$Zm&O1}K`kAO?8;gC9Q&LCs01Rwt=39oow|8u`WQ? 
zP^4K%mnBhtX1yj@2r+Mh7$>s>F!9CaaPh`xnM_V^b~XI5$P)tE+Naas>U+*zm!oaH zE43TIC!^!(oSS?d>Kqor^M#j7e;&JP*O;rqg6`MXnAb=bYRe~ipe zqyd{Ox0HOo^Jpl?;J1^L)h0r7gD)0>waLzQZMJ>IP-|Et~F zUDpPXgSVag^Q)rCa7o*0YY~tWB`|#@o(upIR2=3srf%clw^^6+aI<9w+FesC1GPD= zaEYd{b0RaMgQ&tD41iAC@<6v~Ccu+%yQ#vslvWU$c(>WwXwUb>iWUv`seq|KQLNn_>ir%0YQ%bY^L`w? z-a+jYVKckg};iE+IMl`F2b$nCUGv2#j-yRn-OK>r$K8B- zL&=WutKHC{4Ryz>NwK-UE=AT;bbTa#j;b9#O&iy0BN^$qrgcZPeg_qbk@=)A=k`M( z<4&u$`mlRqqwvCKKapFEzmjoV(LqqR$KYHz%?ii2kXwJqiUn~S1|zaE%IbS$zibis z1k0%;1Q{I@dH|UI9YvC7!P~^W{WVXJXWGv!9G}{uKP^QvVQ9iCunM(}Ywfk_HAUd! z;k;?b(Nv((MYiSb>ROj2-?B_bB;WLLH)Vdtd*h?~j|nI}2A3bdH$B*XC~;2%o$z#7 z2MdAruTNKV1?3V=$2Wugq$8~RsLAu?8lj^e^hcsj$o473O^7MCt`ot5cFFj4?^sNd z@2U%agT-sN?aW+FX!*urd8RejdI~o5-fgaN>74S5Iwo{G0_1Xu>mt(LlOj~}7n_&7qb;B;d zG2OhGOj-M85)c)SRs3fT-)XmwIl1MO&sE$G`Vf!@X(n>@bbI`#llm^Xy6e%!I-IKB z<%BIQ&2XC}t|poZ4o5i=O zTO&7h>kccQ%C2xTqG=-bhu5Tm^AYYMNkwYZiS#rbL+W&^`=y2Fpkv2Z>UH;>kv=}K zvKE4Z;g~P3qTiA0GH8(zVu>ygiZ}8^rP~qF;-3fV82H>Cr~Z^5tKnu7#Iww(_kZ6y z-n|OujpcLs@n=EDmhN`pk9TBES(c9b+;edktdmZvNGc*E48{cxS?(ncg)kHXnI8lc z0tJ(ZNCE&y2?7Q~1meHZ|7FX6#o@oh;Qv|ez_9=F`gEHBi;OD;UPtc4OOnKcm%u64 zAC|I^+QIM{Hj62;WiaZ(6l_KP7xALTn>s||$-tnag|tv0jfF4h;jC3lk(_KL$m)R> z)lI76=T{V=b}R}{Oi^b(i6-0dW1g;5r^yzFev>qx=M>nK5=!DhBajBPZIhqiJFcK} zGcq2GibrN@&XwB*ud?+_H+zFkM-sjnRoxcs@G?R9%0WV<)2UbE+)@ggj$o&8SgHUT z-9QeZe=#^}Cu5^cJdYD@4;Py=V9ZddMs@aB2DhRs*u3nDQH1xzW}!lh%@))gB?z{# z*2p_{`7u`#|JObrA4mv3x%7R1UT0rW+xa52-)yWg*FfSX*Z9e=n^cqRdzlIp7<2Uf zJ5$p|&fzcGrdmDs-6Y#CQr{PcmeZyw;=lNdo|ngK=vjId3*2hw;LLM81FsJinl-)0 zJV{xPV2F>y?NX&q(}<8)_%_a#F8Qn-J>8Ku^C=*yp|iI7c7%^PCQw8TFRo0#`$)Yr z;_2tTCBW-=0}L`i0fV5jet%kX1=@JKX(+u>Og800;iiT86-hPAj3oCr=t~z58e)`O zL4h30j5_QbRxk8go#T_}HAcW4NL!ra)0#?x?ta#?-|GuTc(icXtbBWWTI`Z(b3FxV z>G{6iZju=jTeJf%x9ndphe(-^tJ-XN);m1iEa%ITWUJXHg`O+qGI^Af8MMg3D3HRJ zU1r~Sv+?xcZ8_(tua>i5TnZIp~9g_#5vb#G?{TX7*8L;&2VzR zc{?WIug!UT($$=*V(_C2QGC(jAs91)IGVq^=*!n=5y{vX*KxNZ+v<8s=aDg>5S)Du 
zHjmr1?K#Zdkmp+NKkN1O0bB7q?q|7;)LYK(yxy;)$3Fi>iQTPv+D+GNcZ)aGK7OFgotOnHpZIF?QN}1%t#)!5FZs zG4{z}uzf5U7|^Az8G2N0=5jRmCv6f&?jH~`uxJ=Og3a@LduD*vbAy!*=w@9Se&5ryObgZngj>51X=5{aA1{v)gv>PMg+8W)+2r-)8Io8uA`gK*!2D zF^#$;d6SrSEO7ePxcv0CPW~X0i|tQG>Zn!+HF!Id7G;5#Jijjo3C*n|vjY)50`S?*npbf>Iv4dlD}nnub{)G{K>S$|II0VK%esd=O8c1b;T1GH+Ka~^m~-CI!o_@ou7|Bie|K$0a{{arI5Uv) zPD$59@@;FxS#GLT#}Z#SNs2oRMi|~kvTD0xp&jFn>>5|qN=bT`BlPv6kNwTGWSd`} z&+F5f?8_a{4!!BT?Y#30`Tga35`EE9Xx3?aLV2iZDc7Vp%N1$N;GqXBS@;%W=5V>K zFDqf<#b7{Lv(1WT%tuyo05~sb)b_DlpnV3uGhymlx&@aT;YfBYmr~IprFFM@wLURd z_8x6DOP5djT5hQDtWne{R0R zf`JdHwqPjG1{mRnH1>IIlgouG*4I2!)w%1g9SjgVp4#d~W=0|4vOc)u|K9Bv)gH{S zgKXgpN;(WijQ!P<4HsDyN^AYlAY`4!p{Z)jpSG5o3m#~jcVKK0P4iJ%dBEXpF+BLb z6De=8!2O7BOr@l5)wgQ$z^85~a{2pWvPO#hq{(*=-GGCJ3gD^BcaO<0`y_%MCNR!{ z@aPjf?Qo{19aSf25iJXml~|r~LSVUL0c+IWOgcujUj|p{6V8U_iS#SqSAE}K@qt&- zwA0>}(3PnDU`!$Y%fogo{$mQW0f|Ll1#q0PWMed(&&7WI{-3 z6P2%8Kh1YDB?j#jN#DwgQe>^~L`Y$r!6fdPFjiR=b^18LO#m=~=OAcb+IBrzVm0n+ z{cgM6+^y&dm*=nT+(lr&2FBvyKi+CKvT`=bvo+$>#xdu%oYao$E|UT_{4v`&6x~`* zS9pNZf#RMMi+XNZdc;0&V0fotZYi)~=riB0Q%uo#HuH`4;u@LL^Q7sbh#a(mj{a-= z<6g$9YZhx$kZx}NqLcFs+3FtSv5VSRgT8f^_~@7b6v7T3S?(XZZ>6+Ik3ScguStC# zeu=l#D>oU;nMH{W-g%+M;_q)>r8_X-yPpQQ8)nE}gLQOJc>__SKNoXZo%Cw4khIBY z!DD^`sztp9_2MH65f95)dB@jjxbZWwob=k~&cmXNlcNjv^~bN!u_t~-Ho(SBA3_cb zj8s-}{O1Pb73A-+#Hjsw&*L~s7H@rM{0w8|f`X=5SFjxLMD`-h=U~kzykEtM!MPpV z%Tb)^7$KYAbb9ibdMQc?rP?u`C#dG({ngX;ink4VCzW?c{IkO|a8pVK9d2!3*4!5=SiTE6SYmMG@IYFyyC&`SApZ z|42a6A#Xw$_9uZlv?=PQQTb*aM?-gWCbj8fD9g%H1z2})df3p^7Ap$B&h1X?XiWcs z<0#CEU3;1yD(iZG^GuOnJOl)RL0g>dwfqx<=CA;tK}P#<;`*I*#Pl@ksVbL9pC|w z5cQ=FyvK6~AIn@}s@X#2Fh7E)x-suRDV(|CgeKO6F$d=VUpwdi57pMkafC=I94bXp zjX1~MA;U1mqa;aFbTBSW7`e=mQ4NME$4j0h85!j=t`EwP+%FBnOqYon`kiKf9*B3pyrx)}a{2F1^^t-EB_g$=*4; zBBk#9v^{y&E_y4gPcO{pIu)vJk$Ma)#B7ZYYe_ZzFvj|(N4OUqGuIX7c@R(zzvtry zJtT&8(l{8_fu*3_hHK*kk1QhuSCSD`gh=0Nf{ETMx_>DlZ5x)=cV*Vg2?#^>!(2Nc zG2;eMlb*14)>FsB4JN(o=ozrAGAcRk^zZ^Q;?E?VCfPqTF)8w2H~KWUSxqe~SmoiH 
zXX+G~by!?7uZeT3Y@~D&`Cg<8;wPCpqhEFJsjRc0Y91MFSrf>tFsvjwM8}G3)}Waz z^p*PkGlX7W_E|?+GtP6N)@)2us7fVISbzrOOjyueyvPmPW}Bj@*;H z*AsbhAozv#^x-Yppni^C1bw^k_=@>qRrVxVt8T8B(ew*BMSsYjg7%7SKCo7XkrSa# zaW(`B*pSp>40ZuJ31Pa$E%PZ>0R$N{IHQwy!c^>?q-KHfjy4soOp%e-Pn+a%c$!wZ zW!60HmXP7+NSy7o3Lf%$4}!#+<=2rffwso}rVJS~F@3Ef zI&BiHR%(AUv01bZvCGn6f41T`wApPfn2xQrIJzp0TKEW;8`kvX$6qdK*cdTWjeP64 zg!2yAli|eQiyA2!E3V0nTAUtF8wJzGap2i>BKEN5b+SKpyKhb9>QbwnDZ;)94?`$} zEdH)SpG`0tQgZ2lJD{4JwLN3#pT`(%dppXefHHR5edd$VaRB5NQc&@boCP<-{IYY zDXVh@ZsBXS6T3dd5mNFw8(H-eaEp0N5tKTM+FCcg=Fu-IEljy447DNRZKG_=Z>6BB z7L_&-6*1lBaTP99Flmz*6HR)MM}6J2dj|ZqKLc=?qiB$Se3pIPa?ER{jG5r!T*dKT zB$Dz@IJ1S=nw(?Mx_*t~@RznlrEn+Tq7Y#Umm5-i^PU+S+LI7F?THJ;1lqpqG5TXH z0KY%++j!-Dqrj#9P8qW%AL6LnQpvq?x|RLDIAXE~E`IXvs;Fwg)GF`w4SM*kL}+~o z6I@A#%%-v;7l%fk2_^iaMZpz~jIRk)Fi!cOK@qj}+hox0Cj__U+d4dfR<W6T%2DN#bI zSYa1CJ&gQH4ai0V$st~nXpt0}E-F!!s3phxoy6&5)va~M$ff}owZLvJqR z7!Q-b`O^rsOYO3(sO`9oPn}EGMZMhJGW^xQ^@H0yBd3ynlP!4P z-qi@3KlIq|5RMiSh44Y-%94gr2iIy1$NS(Z^l-~)pT>;B9Zxp)-u9YW7{3E0gw>5k zS!?Vr5-a?hzQ0E>$b%Ct3ubbjuPmVWXiRLU>4YWIl1rReU1%KGG~Y}cuF-~|OJMQ4 z4Md27P>n!O5!9k-!nwO;WYM+iGDJ6BVD%hIKULD25~6hiWY}fi z&&rgR(fUV}wS?A7v~jl2+X0_z=Eu)A9w^bij%BDP_*=A7GEYu_s4Qn{IQ2c|(Z5i; zg@(x`Q`uwP`c1U*!HWd+X1;1hSpcMm0U;YTDAzyN)}rFT(m!e}MktvMzyb|IkFmsR zNqZBu1%Zm4>skTb_;UL^J%!(C?&Bl*A#!^p`-Y4To~uDFEal)s?tJ^@IKG#v%I11S zl}*_aaq=osK|g);aKX@A1=EL-J9qfeC?sCmRbXDoWHxkcWQMDM3f((@ta_5BZfejm z!`}{9ou}{H+)jHkw@U>nfoGkZUAw3CfJUK%K{`B{+)O5AyrOwF4LIUa(Ho<@cWpk# zhH5INyWCh~ZJ#dftADUp`aGywOE1hu&oJwd@-B0B^t1Yr4 zkzg8&_PR>{pjxDs1Gtt=bp9{N)k;+a;l%f=6x(kSETqJ(J>YuJe)EHyoEIEu!9ioa ut}f}y|J;@Q(fE(B|0uZslUHpyt|N0xuFkt7+2FGa3CDk(wXZngd;4z>+sbbM literal 0 HcmV?d00001 From 543e1a67e1f56901e0bce6bf6e51eb95ff23cd1b Mon Sep 17 00:00:00 2001 From: Zafer Kiziltoprak Date: Mon, 13 Apr 2026 14:44:24 +0200 Subject: [PATCH 2/6] feat: add Amazon Textract examples --- .../examples/analyze_document_example.py | 25 ++++++++++++++ .../examples/queries_example.py | 34 +++++++++++++++++++ 
.../examples/text_extraction_example.py | 17 ++++++++++ integrations/amazon_textract/pyproject.toml | 1 + 4 files changed, 77 insertions(+) create mode 100644 integrations/amazon_textract/examples/analyze_document_example.py create mode 100644 integrations/amazon_textract/examples/queries_example.py create mode 100644 integrations/amazon_textract/examples/text_extraction_example.py diff --git a/integrations/amazon_textract/examples/analyze_document_example.py b/integrations/amazon_textract/examples/analyze_document_example.py new file mode 100644 index 0000000000..a8e2dd6c23 --- /dev/null +++ b/integrations/amazon_textract/examples/analyze_document_example.py @@ -0,0 +1,25 @@ +# To run this example, you will need to: +# 1) Set `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_DEFAULT_REGION` environment variables +# 2) Place a document image named `invoice.png` in the same directory as this script +# +# This example demonstrates structural analysis using AWS Textract's AnalyzeDocument API. +# Setting `feature_types` enables extraction of tables, forms, and layout information +# in addition to plain text. 
+ +from haystack_integrations.components.converters.amazon_textract import AmazonTextractConverter + +converter = AmazonTextractConverter(feature_types=["TABLES", "FORMS"]) + +results = converter.run(sources=["invoice.png"]) + +for doc in results["documents"]: + print(f"--- {doc.meta.get('file_path', 'unknown')} ---") + print(doc.content) + print() + +raw = results["raw_textract_response"][0] +table_blocks = [b for b in raw.get("Blocks", []) if b.get("BlockType") == "TABLE"] +print(f"Tables found: {len(table_blocks)}") + +form_blocks = [b for b in raw.get("Blocks", []) if b.get("BlockType") == "KEY_VALUE_SET"] +print(f"Key-value pairs found: {len(form_blocks)}") diff --git a/integrations/amazon_textract/examples/queries_example.py b/integrations/amazon_textract/examples/queries_example.py new file mode 100644 index 0000000000..aa70efe689 --- /dev/null +++ b/integrations/amazon_textract/examples/queries_example.py @@ -0,0 +1,34 @@ +# To run this example, you will need to: +# 1) Set `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_DEFAULT_REGION` environment variables +# 2) Place a document image named `medical_form.png` in the same directory as this script +# +# This example demonstrates natural-language queries using AWS Textract. +# The QUERIES feature type is enabled automatically when you pass the `queries` +# parameter at runtime. Textract will attempt to find answers to each question +# in the document. 
+ +from haystack_integrations.components.converters.amazon_textract import AmazonTextractConverter + +converter = AmazonTextractConverter() + +results = converter.run( + sources=["medical_form.png"], + queries=["What is the patient name?", "What is the date of birth?", "What is the diagnosis?"], +) + +for doc in results["documents"]: + print("--- Extracted text ---") + print(doc.content) + print() + +raw = results["raw_textract_response"][0] +query_blocks = [b for b in raw.get("Blocks", []) if b.get("BlockType") == "QUERY"] +for block in query_blocks: + question = block.get("Query", {}).get("Text", "") + print(f"Q: {question}") + +query_result_blocks = [b for b in raw.get("Blocks", []) if b.get("BlockType") == "QUERY_RESULT"] +for block in query_result_blocks: + answer = block.get("Text", "") + confidence = block.get("Confidence", 0) + print(f"A: {answer} (confidence: {confidence:.1f}%)") diff --git a/integrations/amazon_textract/examples/text_extraction_example.py b/integrations/amazon_textract/examples/text_extraction_example.py new file mode 100644 index 0000000000..e979e64146 --- /dev/null +++ b/integrations/amazon_textract/examples/text_extraction_example.py @@ -0,0 +1,17 @@ +# To run this example, you will need to: +# 1) Set `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_DEFAULT_REGION` environment variables +# 2) Place an image or single-page PDF named `document.png` in the same directory as this script +# +# This example demonstrates basic text extraction from a document image using +# AWS Textract's DetectDocumentText API. 
+ +from haystack_integrations.components.converters.amazon_textract import AmazonTextractConverter + +converter = AmazonTextractConverter() + +results = converter.run(sources=["document.png"]) + +for doc in results["documents"]: + print(f"--- {doc.meta.get('file_path', 'unknown')} (pages: {doc.meta.get('page_count')}) ---") + print(doc.content) + print() diff --git a/integrations/amazon_textract/pyproject.toml b/integrations/amazon_textract/pyproject.toml index a90ff53fed..b96d08bf48 100644 --- a/integrations/amazon_textract/pyproject.toml +++ b/integrations/amazon_textract/pyproject.toml @@ -157,6 +157,7 @@ ban-relative-imports = "parents" [tool.ruff.lint.per-file-ignores] # Tests can use magic values, assertions, relative imports, and don't need type annotations "tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"] +"examples/**/*" = ["T201"] [tool.coverage.run] source = ["haystack_integrations"] From 9563b7c9a1bb032139059d45274047cd193a9ee0 Mon Sep 17 00:00:00 2001 From: Zafer Kiziltoprak Date: Mon, 13 Apr 2026 14:53:13 +0200 Subject: [PATCH 3/6] fix: linting issue with the github workflow file --- .github/workflows/amazon_textract.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/amazon_textract.yml b/.github/workflows/amazon_textract.yml index 82d7394127..becdeb4039 100644 --- a/.github/workflows/amazon_textract.yml +++ b/.github/workflows/amazon_textract.yml @@ -44,8 +44,8 @@ jobs: steps: - id: set run: | - echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> $GITHUB_OUTPUT - echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> $GITHUB_OUTPUT + echo 'os=${{ github.event_name == 'push' && '["ubuntu-latest"]' || env.TEST_MATRIX_OS }}' >> "$GITHUB_OUTPUT" + echo 'python-version=${{ github.event_name == 'push' && '["3.10"]' || env.TEST_MATRIX_PYTHON }}' >> "$GITHUB_OUTPUT" run: name: Python ${{ matrix.python-version }} on ${{ 
startsWith(matrix.os, 'macos-') && 'macOS' || startsWith(matrix.os, 'windows-') && 'Windows' || 'Linux' }} From ef756277aceed01ddeba875d95827f7348bc08bb Mon Sep 17 00:00:00 2001 From: Zafer Kiziltoprak Date: Mon, 13 Apr 2026 15:07:34 +0200 Subject: [PATCH 4/6] fix: typo causing failure of api-reference-build in CI --- integrations/amazon_textract/pydoc/config_docusaurus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/amazon_textract/pydoc/config_docusaurus.yml b/integrations/amazon_textract/pydoc/config_docusaurus.yml index bcfb7104ac..61eaf37e11 100644 --- a/integrations/amazon_textract/pydoc/config_docusaurus.yml +++ b/integrations/amazon_textract/pydoc/config_docusaurus.yml @@ -9,5 +9,5 @@ processors: renderer: description: Amazon Textract integration for Haystack id: integrations-amazon_textract - filename: amazon-textract.md + filename: amazon_textract.md title: Amazon Textract From abe2ca4c0a9c78e8d8d1ede1dfe6b0ff97d6ce26 Mon Sep 17 00:00:00 2001 From: Zafer Kiziltoprak Date: Mon, 13 Apr 2026 15:41:21 +0200 Subject: [PATCH 5/6] fix: update naming convention --- .github/workflows/CI_coverage_comment.yml | 2 +- .github/workflows/amazon_textract.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/CI_coverage_comment.yml b/.github/workflows/CI_coverage_comment.yml index 03af909d75..d097192a0b 100644 --- a/.github/workflows/CI_coverage_comment.yml +++ b/.github/workflows/CI_coverage_comment.yml @@ -6,7 +6,7 @@ on: - "Test / aimlapi" - "Test / amazon-bedrock" - "Test / amazon-sagemaker" - - "Test / amazon_textract" + - "Test / amazon-textract" - "Test / anthropic" - "Test / arcadedb" - "Test / astra" diff --git a/.github/workflows/amazon_textract.yml b/.github/workflows/amazon_textract.yml index becdeb4039..3e907eead6 100644 --- a/.github/workflows/amazon_textract.yml +++ b/.github/workflows/amazon_textract.yml @@ -1,6 +1,6 @@ # This workflow comes from
https://github.com/ofek/hatch-mypyc # https://github.com/ofek/hatch-mypyc/blob/5a198c0ba8660494d02716cfc9d79ce4adfb1442/.github/workflows/test.yml -name: Test / amazon_textract +name: Test / amazon-textract on: schedule: From 1c4865d0cc7d055f382780516b5f85c02336af99 Mon Sep 17 00:00:00 2001 From: Zafer Kiziltoprak Date: Mon, 13 Apr 2026 15:42:12 +0200 Subject: [PATCH 6/6] fix: remove redundant imports and errors --- integrations/amazon_textract/pyproject.toml | 4 +--- .../components/converters/amazon_textract/__init__.py | 5 +++-- .../components/converters/amazon_textract/converter.py | 9 +++------ .../components/converters/amazon_textract/errors.py | 6 +----- .../tests/test_amazon_textract_converter.py | 4 ++-- 5 files changed, 10 insertions(+), 18 deletions(-) diff --git a/integrations/amazon_textract/pyproject.toml b/integrations/amazon_textract/pyproject.toml index b96d08bf48..af772bb9bf 100644 --- a/integrations/amazon_textract/pyproject.toml +++ b/integrations/amazon_textract/pyproject.toml @@ -28,7 +28,6 @@ classifiers = [ "Programming Language :: Python :: 3.13", "Programming Language :: Python :: 3.14", "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", ] dependencies = [ "haystack-ai>=2.24.1", @@ -63,7 +62,6 @@ fmt-check = "ruff check {args} && ruff format --check {args}" [tool.hatch.envs.test] dependencies = [ "pytest", - "pytest-asyncio", "pytest-cov", "pytest-rerunfailures", "mypy", @@ -173,7 +171,7 @@ exclude_lines = ["no cov", "if __name__ == .__main__.:", "if TYPE_CHECKING:"] [tool.pytest.ini_options] addopts = "--strict-markers" markers = [ + "unit: unit tests", "integration: integration tests", ] log_cli = true -asyncio_default_fixture_loop_scope = "function" diff --git a/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/__init__.py 
b/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/__init__.py index 3ef76c1906..145fdb12d7 100644 --- a/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/__init__.py +++ b/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/__init__.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from haystack_integrations.components.converters.amazon_textract.converter import AmazonTextractConverter +from .converter import AmazonTextractConverter +from .errors import AmazonTextractConfigurationError -__all__ = ["AmazonTextractConverter"] +__all__ = ["AmazonTextractConfigurationError", "AmazonTextractConverter"] diff --git a/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/converter.py b/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/converter.py index 805cc2e00e..e6068ce1b7 100644 --- a/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/converter.py +++ b/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/converter.py @@ -2,7 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -import os from pathlib import Path from typing import Any @@ -14,9 +13,7 @@ from haystack.dataclasses import ByteStream from haystack.utils import Secret, deserialize_secrets_inplace -from haystack_integrations.components.converters.amazon_textract.errors import ( - AmazonTextractConfigurationError, -) +from .errors import AmazonTextractConfigurationError logger = logging.getLogger(__name__) @@ -125,7 +122,7 @@ def resolve_secret(secret: Secret | None) -> str | None: **(self.boto3_config if self.boto3_config else {}), ) self._client = session.client("textract", config=config) - except Exception as e: + except BotoCoreError as e: msg = ( "Could not connect to AWS Textract. 
Make sure the AWS environment is configured correctly. " "See https://boto3.amazonaws.com/v1/documentation/api/latest/guide/quickstart.html#configuration" @@ -180,7 +177,7 @@ def run( merged_metadata = {**bytestream.meta, **metadata} if not self.store_full_path and (file_path := bytestream.meta.get("file_path")): - merged_metadata["file_path"] = os.path.basename(file_path) + merged_metadata["file_path"] = Path(file_path).name doc = self._create_document(response, merged_metadata) documents.append(doc) diff --git a/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/errors.py b/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/errors.py index 4e65ac986f..3e77de21db 100644 --- a/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/errors.py +++ b/integrations/amazon_textract/src/haystack_integrations/components/converters/amazon_textract/errors.py @@ -3,9 +3,5 @@ # SPDX-License-Identifier: Apache-2.0 -class AmazonTextractError(Exception): - """Any error generated by the Amazon Textract integration.""" - - -class AmazonTextractConfigurationError(AmazonTextractError): +class AmazonTextractConfigurationError(Exception): """Exception raised when AWS is not configured correctly for Textract.""" diff --git a/integrations/amazon_textract/tests/test_amazon_textract_converter.py b/integrations/amazon_textract/tests/test_amazon_textract_converter.py index 46a7529b29..a7c0106047 100644 --- a/integrations/amazon_textract/tests/test_amazon_textract_converter.py +++ b/integrations/amazon_textract/tests/test_amazon_textract_converter.py @@ -6,7 +6,7 @@ from unittest.mock import MagicMock, patch import pytest -from botocore.exceptions import ClientError +from botocore.exceptions import BotoCoreError, ClientError from haystack.dataclasses import ByteStream from haystack.utils import Secret @@ -190,7 +190,7 @@ def test_warm_up_idempotent(self, mock_session_cls): 
@patch( "haystack_integrations.components.converters.amazon_textract.converter.boto3.Session", - side_effect=Exception("bad config"), + side_effect=BotoCoreError(), ) def test_warm_up_configuration_error(self, _mock_session_cls): converter = AmazonTextractConverter(