feat: add Amazon Textract examples

zafatar · zafatar · commit 543e1a67e1f5 · 2026-04-13T14:44:24.000+02:00
diff --git a/integrations/amazon_textract/examples/analyze_document_example.py b/integrations/amazon_textract/examples/analyze_document_example.py
@@ -0,0 +1,33 @@
+# To run this example, you will need to:
+# 1) Set `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_DEFAULT_REGION` environment variables
+# 2) Place a document image named `invoice.png` in the same directory as this script
+#
+# This example demonstrates structural analysis using AWS Textract's AnalyzeDocument API.
+# Setting `feature_types` enables extraction of tables, forms, and layout information
+# in addition to plain text.
+
+from haystack_integrations.components.converters.amazon_textract import AmazonTextractConverter
+
+converter = AmazonTextractConverter(feature_types=["TABLES", "FORMS"])
+
+results = converter.run(sources=["invoice.png"])
+
+for doc in results["documents"]:
+    print(f"--- {doc.meta.get('file_path', 'unknown')} ---")
+    print(doc.content)
+    print()
+
+# Only one source was submitted, so inspect the first raw response. Fall back to an
+# empty block list so the summary below still prints if no response came back.
+raw_responses = results["raw_textract_response"]
+blocks = raw_responses[0].get("Blocks", []) if raw_responses else []
+
+table_blocks = [b for b in blocks if b.get("BlockType") == "TABLE"]
+print(f"Tables found: {len(table_blocks)}")
+
+# Textract emits two KEY_VALUE_SET blocks per form field, one whose EntityTypes is
+# ["KEY"] and one whose EntityTypes is ["VALUE"]; count only the KEY blocks so that
+# each key-value pair is counted once.
+form_blocks = [b for b in blocks if b.get("BlockType") == "KEY_VALUE_SET"]
+key_blocks = [b for b in form_blocks if "KEY" in (b.get("EntityTypes") or [])]
+print(f"Key-value pairs found: {len(key_blocks)}")
diff --git a/integrations/amazon_textract/examples/queries_example.py b/integrations/amazon_textract/examples/queries_example.py
@@ -0,0 +1,50 @@
+# To run this example, you will need to:
+# 1) Set `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_DEFAULT_REGION` environment variables
+# 2) Place a document image named `medical_form.png` in the same directory as this script
+#
+# This example demonstrates natural-language queries using AWS Textract.
+# The QUERIES feature type is enabled automatically when you pass the `queries`
+# parameter at runtime. Textract will attempt to find answers to each question
+# in the document.
+
+from haystack_integrations.components.converters.amazon_textract import AmazonTextractConverter
+
+converter = AmazonTextractConverter()
+
+results = converter.run(
+    sources=["medical_form.png"],
+    queries=["What is the patient name?", "What is the date of birth?", "What is the diagnosis?"],
+)
+
+for doc in results["documents"]:
+    print("--- Extracted text ---")
+    print(doc.content)
+    print()
+
+# Only one source was submitted, so inspect the first raw response. Fall back to an
+# empty block list so the loop below is simply skipped if no response came back.
+raw_responses = results["raw_textract_response"]
+blocks = raw_responses[0].get("Blocks", []) if raw_responses else []
+
+# A QUERY block points at its QUERY_RESULT blocks through an ANSWER relationship, so
+# index the results by block ID and print each answer directly under its question.
+answers_by_id = {b.get("Id"): b for b in blocks if b.get("BlockType") == "QUERY_RESULT"}
+
+for block in blocks:
+    if block.get("BlockType") != "QUERY":
+        continue
+    question = block.get("Query", {}).get("Text", "")
+    print(f"Q: {question}")
+
+    answers = []
+    for relationship in block.get("Relationships") or []:
+        if relationship.get("Type") == "ANSWER":
+            answers.extend(answers_by_id[i] for i in relationship.get("Ids", []) if i in answers_by_id)
+
+    # A query with no linked QUERY_RESULT block means Textract found no answer for it.
+    if not answers:
+        print("A: (no answer found)")
+    for answer in answers:
+        text = answer.get("Text", "")
+        confidence = answer.get("Confidence", 0)
+        print(f"A: {text} (confidence: {confidence:.1f}%)")
diff --git a/integrations/amazon_textract/examples/text_extraction_example.py b/integrations/amazon_textract/examples/text_extraction_example.py
@@ -0,0 +1,19 @@
+# Prerequisites for running this example:
+# 1) Export the `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_DEFAULT_REGION` environment variables
+# 2) Put an image named `document.png` in the same directory as this script
+#    (to use a single-page PDF instead, pass its file name in `sources` below)
+#
+# The script runs basic text extraction on a document image through
+# AWS Textract's DetectDocumentText API.
+
+from haystack_integrations.components.converters.amazon_textract import AmazonTextractConverter
+
+textract = AmazonTextractConverter()
+output = textract.run(sources=["document.png"])
+documents = output["documents"]
+
+# Print a header built from the `file_path` and `page_count` metadata, then the extracted text.
+for doc in documents:
+    print(f"--- {doc.meta.get('file_path', 'unknown')} (pages: {doc.meta.get('page_count')}) ---")
+    print(doc.content)
+    print()
diff --git a/integrations/amazon_textract/pyproject.toml b/integrations/amazon_textract/pyproject.toml
@@ -157,6 +157,7 @@ ban-relative-imports = "parents"
 [tool.ruff.lint.per-file-ignores]
 # Tests can use magic values, assertions, relative imports, and don't need type annotations
 "tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"]
+"examples/**/*" = ["T201"]
 
 [tool.coverage.run]
 source = ["haystack_integrations"]