Skip to content

Commit 543e1a6

Browse files
committed
feat: add Amazon Textract examples
1 parent 4690039 commit 543e1a6

4 files changed

Lines changed: 77 additions & 0 deletions

File tree

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
# To run this example, you will need to:
# 1) Set `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_DEFAULT_REGION` environment variables
# 2) Place a document image named `invoice.png` in the same directory as this script
#
# This example demonstrates structural analysis using AWS Textract's AnalyzeDocument API.
# Setting `feature_types` enables extraction of structured data (tables and forms are
# requested here) in addition to plain text.

from haystack_integrations.components.converters.amazon_textract import AmazonTextractConverter

converter = AmazonTextractConverter(feature_types=["TABLES", "FORMS"])

results = converter.run(sources=["invoice.png"])

for doc in results["documents"]:
    print(f"--- {doc.meta.get('file_path', 'unknown')} ---")
    print(doc.content)
    print()

# Inspect the raw Textract response of the first (and only) source for structural blocks.
raw = results["raw_textract_response"][0]
blocks = raw.get("Blocks", [])

table_blocks = [b for b in blocks if b.get("BlockType") == "TABLE"]
print(f"Tables found: {len(table_blocks)}")

# Textract emits two KEY_VALUE_SET blocks per form field: one whose `EntityTypes`
# contains "KEY" and one whose `EntityTypes` contains "VALUE". Counting only the
# KEY blocks yields the number of key-value pairs rather than twice that number.
form_blocks = [
    b for b in blocks if b.get("BlockType") == "KEY_VALUE_SET" and "KEY" in b.get("EntityTypes", [])
]
print(f"Key-value pairs found: {len(form_blocks)}")
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
# To run this example, you will need to:
# 1) Set `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_DEFAULT_REGION` environment variables
# 2) Place a document image named `medical_form.png` in the same directory as this script
#
# This example demonstrates natural-language queries using AWS Textract.
# The QUERIES feature type is enabled automatically when you pass the `queries`
# parameter at runtime. Textract will attempt to find answers to each question
# in the document.

from haystack_integrations.components.converters.amazon_textract import AmazonTextractConverter

converter = AmazonTextractConverter()

results = converter.run(
    sources=["medical_form.png"],
    queries=["What is the patient name?", "What is the date of birth?", "What is the diagnosis?"],
)

for doc in results["documents"]:
    print("--- Extracted text ---")
    print(doc.content)
    print()

# Inspect the raw Textract response of the first (and only) source for query blocks.
raw = results["raw_textract_response"][0]
blocks = raw.get("Blocks", [])

# Index answers by block Id so each question can be matched to its own answer(s)
# instead of relying on the order in which blocks happen to appear.
answers_by_id = {b.get("Id"): b for b in blocks if b.get("BlockType") == "QUERY_RESULT"}

for block in blocks:
    if block.get("BlockType") != "QUERY":
        continue

    question = block.get("Query", {}).get("Text", "")
    print(f"Q: {question}")

    # A QUERY block references its QUERY_RESULT blocks through relationships of
    # type ANSWER; the relationship is missing when Textract found no answer.
    answer_blocks = []
    for relationship in block.get("Relationships") or []:
        if relationship.get("Type") != "ANSWER":
            continue
        for answer_id in relationship.get("Ids", []):
            if answer_id in answers_by_id:
                answer_blocks.append(answers_by_id[answer_id])

    if not answer_blocks:
        print("A: (no answer found)")
        continue

    for answer_block in answer_blocks:
        answer = answer_block.get("Text", "")
        confidence = answer_block.get("Confidence", 0)
        print(f"A: {answer} (confidence: {confidence:.1f}%)")
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
# To run this example, you will need to:
# 1) Set `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_DEFAULT_REGION` environment variables
# 2) Place an image named `document.png` in the same directory as this script
#    (to use a single-page PDF instead, change the file name passed to `sources` below)
#
# This example demonstrates basic text extraction from a document image using
# AWS Textract's DetectDocumentText API.

from haystack_integrations.components.converters.amazon_textract import AmazonTextractConverter

converter = AmazonTextractConverter()

results = converter.run(sources=["document.png"])

# Print a header with the source path and page count, then the extracted text,
# followed by a blank separator line.
for doc in results["documents"]:
    source_path = doc.meta.get("file_path", "unknown")
    page_count = doc.meta.get("page_count")
    print(f"--- {source_path} (pages: {page_count}) ---")
    print(doc.content, end="\n\n")

integrations/amazon_textract/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,7 @@ ban-relative-imports = "parents"
157157
[tool.ruff.lint.per-file-ignores]
158158
# Tests can use magic values, assertions, relative imports, and don't need type annotations
159159
"tests/**/*" = ["PLR2004", "S101", "TID252", "D", "ANN"]
160+
"examples/**/*" = ["T201"]
160161

161162
[tool.coverage.run]
162163
source = ["haystack_integrations"]

0 commit comments

Comments
 (0)