# Source: lib/idp_sdk/examples/basic_processing.py (branch "main") in the GitHub repository
# aws-solutions-library-samples/accelerated-intelligent-document-processing-on-aws
#!/usr/bin/env python3
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

"""
Basic Document Processing Example

This example demonstrates how to use the IDP SDK for basic document processing:
1. Process documents from a local directory
2. Monitor batch progress
3. Download results

Usage:
    python basic_processing.py --stack-name my-idp-stack --directory ./samples/
"""

import argparse
import sys
import time
from pathlib import Path

# Add parent to path for development
sys.path.insert(0, str(Path(__file__).parent.parent))

from idp_sdk import IDPClient


def _build_parser():
    """Build the command-line parser for the basic processing example.

    Returns:
        argparse.ArgumentParser: Parser exposing the stack, input, output and
        polling options that ``main`` reads.
    """
    parser = argparse.ArgumentParser(description="Basic IDP document processing")
    parser.add_argument(
        "--stack-name", required=True, help="IDP CloudFormation stack name"
    )
    parser.add_argument("--region", default="us-west-2", help="AWS region")
    parser.add_argument(
        "--directory", required=True, help="Directory containing documents"
    )
    parser.add_argument(
        "--output-dir", default="./results", help="Output directory for results"
    )
    parser.add_argument(
        "--number-of-files",
        type=int,
        default=None,
        help="Limit number of files to process",
    )
    parser.add_argument(
        "--file-pattern",
        default="*.pdf",
        help="File pattern forwarded to the batch submission (default: *.pdf)",
    )
    parser.add_argument(
        "--poll-interval",
        type=float,
        default=10.0,
        help="Seconds to wait between status checks (default: 10)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=None,
        help="Stop waiting after this many seconds (default: wait indefinitely)",
    )
    return parser


def _wait_for_batch(client, batch_id, poll_interval, timeout):
    """Poll the batch status until it completes or the timeout elapses.

    Args:
        client: Client object whose ``batch.get_status`` is called on each poll.
        batch_id: Identifier of the batch to monitor.
        poll_interval: Seconds to sleep between two status checks.
        timeout: Maximum number of seconds to keep waiting, or ``None`` to
            wait until the batch reports completion.

    Returns:
        tuple: ``(status, finished)`` where ``status`` is the last status
        object fetched and ``finished`` is ``True`` when that status reported
        ``all_complete`` and ``False`` when the wait was abandoned.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = None if timeout is None else time.monotonic() + timeout

    while True:
        status = client.batch.get_status(batch_id=batch_id)

        print(
            f"  Completed: {status.completed}/{status.total} "
            f"(Failed: {status.failed}, In Progress: {status.in_progress})"
        )

        if status.all_complete:
            return status, True

        if deadline is None:
            time.sleep(poll_interval)
            continue

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return status, False
        # Never sleep past the deadline, so one last check happens right at it.
        time.sleep(min(poll_interval, remaining))


def main(argv=None):
    """Submit a directory of documents, wait for the batch, download results.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as the script always did.

    Returns:
        int: ``0`` when the batch completed and no document failed, ``1`` when
        at least one document failed or the wait timed out.

    Raises:
        SystemExit: Raised by argparse for ``--help`` or invalid arguments.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    # Validate before touching AWS: a non-positive interval would hammer the
    # status call in a tight loop (the negated comparison also rejects NaN).
    if not (args.poll_interval > 0):
        parser.error("--poll-interval must be greater than 0")
    if args.timeout is not None and not (args.timeout >= 0):
        parser.error("--timeout must not be negative")

    # Initialize client with stack
    client = IDPClient(stack_name=args.stack_name, region=args.region)

    print(f"Processing documents from: {args.directory}")

    # Submit batch for processing
    batch_result = client.batch.process(
        source=args.directory,
        batch_prefix="sdk-example",
        file_pattern=args.file_pattern,
        number_of_files=args.number_of_files,
    )

    print(f"Batch submitted: {batch_result.batch_id}")
    print(f"Documents queued: {batch_result.queued}")
    # Show at most five IDs and add the ellipsis only if some were left out.
    preview = batch_result.document_ids[:5]
    ellipsis = "..." if len(batch_result.document_ids) > len(preview) else ""
    print(f"Document IDs: {preview}{ellipsis}")

    # Monitor progress
    print("\nMonitoring progress...")
    status, finished = _wait_for_batch(
        client, batch_result.batch_id, args.poll_interval, args.timeout
    )

    if finished:
        print(f"\nBatch complete! Success rate: {status.success_rate:.1%}")

        # Download results
        print(f"\nDownloading results to: {args.output_dir}")
        download_result = client.batch.download_results(
            batch_id=batch_result.batch_id,
            output_dir=args.output_dir,
            file_types=["summary", "sections"],
        )

        print(
            f"Downloaded {download_result.files_downloaded} files "
            f"for {download_result.documents_downloaded} documents"
        )
    else:
        # Only reachable when --timeout was supplied.
        print(
            f"\nTimed out after {args.timeout:g}s waiting for batch "
            f"{batch_result.batch_id}; results were not downloaded.",
            file=sys.stderr,
        )

    # Show any failed documents
    has_failures = status.failed > 0
    if has_failures:
        print("\nFailed documents:")
        for doc in status.documents:
            if doc.status.value == "FAILED":
                print(f"  - {doc.document_id}: {doc.error}")

    return 0 if finished and not has_failures else 1

if __name__ == "__main__":
    # Hand main()'s return value to sys.exit so it becomes the process exit
    # status; a None return exits with status 0.
    sys.exit(main())