-
Notifications
You must be signed in to change notification settings - Fork 88
Expand file tree
/
Copy pathbasic_processing.py
More file actions
103 lines (82 loc) · 2.98 KB
/
basic_processing.py
File metadata and controls
103 lines (82 loc) · 2.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python3
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
"""
Basic Document Processing Example

This example demonstrates how to use the IDP SDK for basic document processing:

1. Process documents from a local directory
2. Monitor batch progress
3. Download results

Usage:
    python basic_processing.py --stack-name my-idp-stack --directory ./samples/
"""
import argparse
import sys
import time
from pathlib import Path
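# Put the directory above this script's folder first on sys.path so that,
# during development, the idp_sdk import below can presumably resolve from
# the source tree.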
sys.path.insert(0, str(Path(__file__).parent.parent))
from idp_sdk import IDPClient
def main():
    """Run the basic IDP example end to end from the command line.

    Options are read from ``sys.argv`` with argparse. The function then:

    1. creates an ``IDPClient`` for the given stack and region,
    2. submits ``--directory`` as one batch using the ``*.pdf`` file pattern,
    3. polls the batch status every 10 seconds until it reports completion,
    4. downloads the "summary" and "sections" results into ``--output-dir``,
    5. prints every document whose status value is ``"FAILED"``.

    Returns:
        None. Progress and results are reported with ``print``.
    """
    # Command-line options, declared in the order they appear in --help.
    option_specs = (
        (
            "--stack-name",
            dict(required=True, help="IDP CloudFormation stack name"),
        ),
        ("--region", dict(default="us-west-2", help="AWS region")),
        (
            "--directory",
            dict(required=True, help="Directory containing documents"),
        ),
        (
            "--output-dir",
            dict(default="./results", help="Output directory for results"),
        ),
        (
            "--number-of-files",
            dict(
                type=int,
                default=None,
                help="Limit number of files to process",
            ),
        ),
    )
    cli = argparse.ArgumentParser(description="Basic IDP document processing")
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    opts = cli.parse_args()

    # Client bound to the requested stack/region.
    idp = IDPClient(stack_name=opts.stack_name, region=opts.region)

    # --- Submit -----------------------------------------------------------
    print(f"Processing documents from: {opts.directory}")
    submission = idp.batch.process(
        source=opts.directory,
        batch_prefix="sdk-example",
        file_pattern="*.pdf",
        number_of_files=opts.number_of_files,
    )
    print(f"Batch submitted: {submission.batch_id}")
    print(f"Documents queued: {submission.queued}")
    # Only the first five IDs are shown to keep the output short.
    print(f"Document IDs: {submission.document_ids[:5]}...")

    # --- Monitor ----------------------------------------------------------
    def poll_once():
        """Fetch the current batch status, print one progress line, return it."""
        current = idp.batch.get_status(batch_id=submission.batch_id)
        counts = f" Completed: {current.completed}/{current.total} "
        breakdown = (
            f"(Failed: {current.failed}, In Progress: {current.in_progress})"
        )
        print(counts + breakdown)
        return current

    print("\nMonitoring progress...")
    progress = poll_once()
    while not progress.all_complete:
        time.sleep(10)  # Wait 10 seconds between status checks.
        progress = poll_once()
    print(f"\nBatch complete! Success rate: {progress.success_rate:.1%}")

    # --- Download ---------------------------------------------------------
    print(f"\nDownloading results to: {opts.output_dir}")
    fetched = idp.batch.download_results(
        batch_id=submission.batch_id,
        output_dir=opts.output_dir,
        file_types=["summary", "sections"],
    )
    print(
        f"Downloaded {fetched.files_downloaded} files "
        f"for {fetched.documents_downloaded} documents"
    )

    # --- Report failures --------------------------------------------------
    # Uses the last status snapshot taken by the polling loop above.
    if progress.failed <= 0:
        return
    print("\nFailed documents:")
    failures = (
        entry for entry in progress.documents if entry.status.value == "FAILED"
    )
    for entry in failures:
        print(f" - {entry.document_id}: {entry.error}")


# Run the example only when this file is executed as a script.
if __name__ == "__main__":
    main()