Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. SPDX-License-Identifier: MIT-0
The Summarization Service module provides functionality for generating summaries of documents using LLMs through AWS Bedrock.
The Summarization Service processes document text through a Bedrock LLM to extract key information and present it in flexible formats. The service is designed to integrate with the IDP pipeline to enable automatic document summarization with support for any JSON structure returned by LLMs. It dynamically adapts to whatever fields are returned in the JSON response from the model.
The DocumentSummary class provides a flexible container for any JSON structure returned by LLMs:
@dataclass
class DocumentSummary:
"""Flexible model for document summary results that can handle any JSON structure."""
content: Dict[str, Any]
"""The raw content from the summarization result, containing any fields the LLM returned."""
metadata: Dict[str, Any] = field(default_factory=dict)
"""Optional metadata about the summarization process."""
def __getitem__(self, key: str) -> Any:
"""Allow dictionary-like access to summary fields."""
def get(self, key: str, default: Any = None) -> Any:
"""Get a summary field with an optional default value."""
def keys(self) -> List[str]:
"""Get a list of available keys in the summary."""
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary representation."""

The DocumentSummarizationResult class contains the comprehensive summarization results:
@dataclass
class DocumentSummarizationResult:
"""Comprehensive summarization result for a document."""
document_id: str
summary: DocumentSummary
execution_time: float = 0.0
output_uri: Optional[str] = None
def to_dict(self) -> Dict[str, Any]:
"""Convert to dictionary representation."""
def to_markdown(self) -> str:
"""Convert summarization results to markdown format."""

The SummarizationService class handles the core summarization functionality:
class SummarizationService:
"""Service for summarizing documents using various backends."""
def __init__(
self,
region: str = None,
config: Dict[str, Any] = None,
backend: str = "bedrock"
):
# Initialize service with region, config and backend
def process_text(self, text: str, extraction_results: Dict[str, Any] = None) -> DocumentSummary:
# Process raw text to generate a summary with flexible structure
# Optionally include extraction results in the summarization context
def process_document_section(
self,
document: Document,
section_id: str
) -> Document:
# Process a specific section of a document and update the Document object with the summary
# Stores summary results in S3 and updates section.attributes with URIs
def process_document(
self,
document: Document,
store_results: bool = True
) -> Document:
# Process a Document object and update it with summary information
# Checks config.summarization.enabled first - returns early if disabled
# Automatically detects whether to use section-based or whole document summarization
# store_results parameter controls whether to create and store the markdown report

from idp_common.summarization.service import SummarizationService
# Initialize service with config
summarization_service = SummarizationService(config=config)
# Summarize text with flexible format
text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit..."
summary = summarization_service.process_text(text)
# Access fields directly using dictionary-like syntax
for key in summary.keys():
print(f"{key}: {summary[key]}")
# Or using get() with optional default value
overview = summary.get('overview', 'No overview available')
key_points = summary.get('key_points', [])
# Summarize text with extraction results
extraction_data = {
"invoice_number": "INV-12345",
"total_amount": 1500.00,
"vendor_name": "ACME Corp",
"line_items": [
{"description": "Product A", "amount": 500.00},
{"description": "Product B", "amount": 1000.00}
]
}
summary = summarization_service.process_text(text, extraction_results=extraction_data)
# The LLM can now reference extracted data in the summary
print(f"Summary with extraction context: {summary.get('overview')}")

from idp_common.models import Document
from idp_common.summarization.service import SummarizationService
import json
# Initialize service
summarization_service = SummarizationService(config=config)
# Load document
document = Document.from_s3("your-bucket", "your-document-key")
# Process a specific section
section_id = "section-1"
document = summarization_service.process_document_section(document, section_id)
# Find the processed section
section = next((s for s in document.sections if s.section_id == section_id), None)
if section and section.attributes and 'summary_uri' in section.attributes:
# Access the summary from S3
from idp_common import s3
summary_uri = section.attributes['summary_uri']
summary_content = s3.get_json_content(summary_uri)
# Print the summary content
print(f"Summary for section {section_id} ({section.classification}):")
print(json.dumps(summary_content, indent=2))
# Access the markdown version
markdown_uri = section.attributes['summary_md_uri']
markdown_content = s3.get_text_content(markdown_uri)
print(f"\nMarkdown Summary:\n{markdown_content[:500]}...")
else:
print(f"No summary available for section {section_id}")

The service requires configuration with the following structure:
{
"summarization": {
"enabled": true,
"model": "us.amazon.nova-pro-v1:0",
"temperature": 0,
"top_k": 5,
"system_prompt": "You are an expert document summarizer. Create a balanced summary that captures key points.",
"task_prompt": "Summarize the following document:\n\n{DOCUMENT_TEXT}\n\nProvide your summary in JSON format with the following fields:\n- 'brief_summary': A brief 1-2 sentence overview\n- 'detailed_summary': A comprehensive summary with key points\n\nEnsure the response is valid JSON."
}
}

The `enabled` property:
- Purpose: Controls whether summarization processing is performed
- Default: `true` (for backward compatibility)
- Behavior:
  - `true`: Summarization processing proceeds normally
  - `false`: Summarization is skipped entirely with minimal overhead

Cost Optimization: When `enabled: false`, no LLM API calls are made, resulting in zero summarization costs.
Example - Disabling Summarization:
summarization:
enabled: false # Disables all summarization processing
# Other properties can remain but will be ignored
model: us.anthropic.claude-3-7-sonnet-20250219-v1:0
temperature: 0.0

The service can handle any JSON structure returned by the model. You can use any field names in your prompt template:
{
"summarization": {
"task_prompt": "Summarize the following document:\n\n{DOCUMENT_TEXT}\n\nProvide your summary in JSON format with the following fields:\n- 'overview': A brief 1-2 sentence overview\n- 'key_points': A list of the most important points\n- 'sections': A dictionary of section titles and summaries\n- 'entities': Important entities mentioned in the document\n\nEnsure all JSON fields are properly formatted and the response is valid JSON."
}
}

Important considerations for the prompt template:
- Always request a valid JSON response format
- Specify the exact fields you want to include
- Include any formatting or style instructions directly in the prompt
- Fields can have any names and be nested as needed
The Summarization Service now supports integration with extraction results, allowing the LLM to generate summaries that are enriched with structured data extracted from documents.
When extraction results are available (from previous extraction stages in the IDP pipeline), the service can automatically include them in the summarization context. This enables the LLM to:
- Cross-reference extracted structured data with document text
- Validate extracted values against document content
- Generate summaries that incorporate both unstructured text and structured data
- Provide context-aware summaries that leverage extraction insights
The service automatically loads extraction results from S3 when available:
# In process_document_section()
if section.extraction_result_uri:
extraction_data = s3.get_json_content(section.extraction_result_uri)
extraction_results = extraction_data.get("inference_result", {})

Extraction results are injected into the prompt using the {EXTRACTION_RESULTS} placeholder:
{
"summarization": {
"task_prompt": "Summarize the following document:\n\n{DOCUMENT_TEXT}\n\nExtracted Data:\n{EXTRACTION_RESULTS}\n\nProvide a summary that incorporates both the text content and extracted data."
}
}

Important: The {EXTRACTION_RESULTS} placeholder is optional. If it is not in your prompt template, the service will still work normally, just without extraction results.
Extraction results are automatically formatted as JSON with proper indentation:
placeholders["EXTRACTION_RESULTS"] = json.dumps(extraction_results, indent=2)

Here's a complete configuration example that leverages extraction results:
summarization:
enabled: true
model: us.amazon.nova-pro-v1:0
temperature: 0
top_k: 5
top_p: 0.1
max_tokens: 4096
system_prompt: |
You are an expert document analyzer. Create comprehensive summaries that integrate
both the document's textual content and any structured data that has been extracted.
task_prompt: |
Analyze the following document:
Document Text:
{DOCUMENT_TEXT}
Extracted Structured Data:
{EXTRACTION_RESULTS}
Provide a comprehensive summary in JSON format with:
- 'overview': Brief document overview
- 'key_findings': Important points from the text
- 'extracted_data_summary': Summary of the extracted structured fields
- 'validation_notes': Any discrepancies between text and extracted data
Ensure the response is valid JSON.

from idp_common.summarization.service import SummarizationService
from idp_common.models import Document
# Initialize service
summarization_service = SummarizationService(config=config)
# Load document with sections that have extraction results
document = Document.from_s3("bucket", "document-key")
# Process a section - extraction results are automatically loaded
section_id = "invoice_section"
document = summarization_service.process_document_section(document, section_id)
# The summary will incorporate extraction results if available
section = next(s for s in document.sections if s.section_id == section_id)
summary_content = s3.get_json_content(section.attributes['summary_uri'])
# Summary might include validation notes
print(summary_content.get('validation_notes'))
# Output: "Extracted invoice total of $1,500.00 matches the total shown in the document text."

# You can also provide extraction results manually
text = "Invoice from ACME Corp. Total amount due: $1,500.00"
extraction_results = {
"vendor_name": "ACME Corp",
"invoice_total": 1500.00,
"invoice_number": "INV-12345",
"due_date": "2024-12-31"
}
summary = summarization_service.process_text(text, extraction_results)
# Summary can now reference both text and structured data
print(summary.get('overview'))
# Output: "This invoice from ACME Corp (INV-12345) totals $1,500.00 and is due on 2024-12-31."

# For whole document summarization, extraction results from all sections are combined
document = Document.from_dict(document_data)
document.sections = [] # Will use whole document approach
# The service combines extraction results from all sections (if any)
document = summarization_service.process_document(document)
# Summary incorporates all extracted data
summary = document.summarization_result.summary
print(summary.get('extracted_data_summary'))

The extraction results should be provided as a dictionary with any structure. Common formats include:
# Simple key-value extraction
extraction_results = {
"invoice_number": "INV-12345",
"total_amount": 1500.00,
"vendor_name": "ACME Corp"
}
# Nested structure with line items
extraction_results = {
"header": {
"invoice_number": "INV-12345",
"invoice_date": "2024-01-15",
"vendor": "ACME Corp"
},
"line_items": [
{"description": "Product A", "quantity": 2, "unit_price": 250.00, "total": 500.00},
{"description": "Product B", "quantity": 1, "unit_price": 1000.00, "total": 1000.00}
],
"totals": {
"subtotal": 1500.00,
"tax": 120.00,
"total": 1620.00
}
}

- Enhanced Context: The LLM has access to both unstructured text and structured data
- Validation: Can identify discrepancies between extracted data and document text
- Richer Summaries: Summaries can incorporate precise numerical values and structured information
- Consistency: Ensures extracted data is reflected in the narrative summary
- Quality Assurance: LLM can flag potential extraction errors by comparing with text
When sections have extraction results, they are stored at:
s3://{output_bucket}/{document.input_key}/sections/{section_id}/extraction_result.json
The service automatically reads from this location when section.extraction_result_uri is set.
The service gracefully handles cases where extraction results are not available:
# If extraction results fail to load, summarization continues without them
try:
extraction_data = s3.get_json_content(section.extraction_result_uri)
extraction_results = extraction_data.get("inference_result", {})
except Exception as e:
logger.warning(f"Failed to load extraction results: {e}")
# Continues with summarization using only text

The Summarization Service integrates with the IDP pipeline by:
- Reading all available document page text
- Combining the text with page markers
- Sending the text to Bedrock LLM for summarization
- Parsing the JSON response from the LLM with any structure
- Creating a `DocumentSummary` with the parsed JSON content
- Creating a `DocumentSummarizationResult` object with results and timing information
- Optionally generating a markdown summary report and storing it in S3 (when `store_results=True`)
- Updating the document with:
  - `summarization_result`: Complete result object with summary, timing, and URI
  - `summary_report_uri`: S3 URI to the markdown report (only when `store_results=True`)
The main advantage of this service is that it can work with any JSON structure returned by the LLM:
- You can specify any JSON structure in your prompt template
- The service preserves the exact structure returned by the model
- The markdown report dynamically creates sections based on the JSON keys
When store_results=True (the default), the service generates a markdown summary report that is stored in S3 at the location:
s3://{output_bucket}/{document.input_key}/summary/summary.md
The report dynamically creates sections based on the JSON keys returned:
# Document Summary: doc-123
## Overview
This is a brief overview of the document.
## Key Points
- Point 1
- Point 2
- Point 3
## Sections
### Introduction
Content about the introduction
### Main Content
Content about the main points
## Entities
- Entity 1
- Entity 2
Execution time: 1.25 seconds

Special formatting is applied based on the data type:
- Lists are formatted as bullet points
- Dictionaries are formatted as nested sections
- Strings are presented as-is
The report is generated using the to_markdown() method of the DocumentSummarizationResult class and can be accessed through document.summary_report_uri or document.summarization_result.output_uri.
When store_results=False, the document is still updated with the summary information and the summarization_result object, but no markdown report is generated or stored in S3.
The process_document method now supports two different approaches to document summarization:
When a document has defined sections, the service automatically uses the section-based approach:
from idp_common.models import Document
from idp_common.summarization.service import SummarizationService
# Initialize service
summarization_service = SummarizationService(config=config)
# Load document with sections already defined
document = Document.from_dict(document_data)
# Process document - will use section-based approach automatically
document = summarization_service.process_document(document)
# Access the combined summary report
print(f"Summary Report URI: {document.summary_report_uri}")
# Access individual section summaries
for section in document.sections:
if section.attributes and 'summary_uri' in section.attributes:
print(f"Section {section.section_id} summary: {section.attributes['summary_uri']}")

This approach:
- Processes each section separately using `process_document_section`
- Stores individual section summaries in S3
- Combines all section summaries into a comprehensive document summary
- Generates a markdown report with all section summaries
The combined markdown report will include all section summaries in a structured format:
# Document Summary: doc-123
This summary combines results from all document sections.
## Section Summaries
# Section: introduction
[Introduction section summary content]
# Section: financial_data
[Financial data section summary content]
Total execution time: 10.25 seconds

When a document has no defined sections, the service automatically falls back to summarizing the entire document at once:
from idp_common.models import Document
from idp_common.summarization.service import SummarizationService
# Initialize service
summarization_service = SummarizationService(config=config)
# Load document without sections
document = Document.from_dict(document_data)
document.sections = [] # No sections defined
# Process document - will use whole document approach automatically
document = summarization_service.process_document(document)
# Access the summary report
print(f"Summary Report URI: {document.summary_report_uri}")

This approach:
- Combines text from all pages
- Generates a single summary for the entire document
- Stores the summary in S3
The markdown report will follow the standard format based on the JSON fields returned by the model.
from idp_common.models import Document
from idp_common.summarization.service import SummarizationService
# Initialize service
summarization_service = SummarizationService(config=config)
# Load document
document = Document.from_dict(document_data)
# Process document and store markdown results in S3 (default)
# Will automatically use section-based or whole document approach
document = summarization_service.process_document(document)
# Access summary through the result object
summary = document.summarization_result.summary
print(f"Available fields: {summary.keys()}")
# Access any field directly
for field in summary.keys():
print(f"{field}: {summary[field]}")
# Access execution info and report URI
print(f"Execution Time: {document.summarization_result.execution_time:.2f} seconds")
print(f"Summary Report URI: {document.summary_report_uri}")
# Process document without storing results in S3
document = summarization_service.process_document(document, store_results=False)
# Document has summarization_result but no summary_report_uri
print(f"Has summarization_result: {document.summarization_result is not None}")
print(f"Has summary_report_uri: {document.summary_report_uri is not None}")

The process_document_section method allows you to generate summaries for specific sections of a document. This is particularly useful for multi-class documents where different sections may require different types of summaries.
- Input: Takes a Document object and a section_id
- Processing:
- Validates the document and finds the specified section
- Extracts text from all pages in the section
- Generates a summary using the Bedrock LLM
- Stores the summary in S3 in both JSON and Markdown formats
- Output:
- Updates the section's attributes with links to the summary files
- Returns the updated Document object
- Section-specific processing: Focuses only on the pages in the specified section
- Attribute initialization: Safely initializes `section.attributes` to an empty dictionary if it's `None`
- Dual format storage: Stores both JSON and Markdown versions of the summary
- Error handling: Gracefully handles errors and updates the document's error list
For a section with ID `section_id`, the summaries are stored at:
- JSON: s3://{output_bucket}/{document.input_key}/sections/{section_id}/summary.json
- Markdown: s3://{output_bucket}/{document.input_key}/sections/{section_id}/summary.md

After processing, the section's attributes will contain:
- `summary_uri`: S3 URI for the JSON summary
- `summary_md_uri`: S3 URI for the Markdown summary
You can process multiple sections sequentially:
# Process all sections in a document
for section in document.sections:
document = summarization_service.process_document_section(
document=document,
section_id=section.section_id
)

Or process them in parallel for better performance:
from concurrent.futures import ThreadPoolExecutor
def process_section(section_id):
return summarization_service.process_document_section(
document=document.copy(), # Create a copy to avoid concurrency issues
section_id=section_id
)
# Process sections in parallel
with ThreadPoolExecutor(max_workers=4) as executor:
section_ids = [section.section_id for section in document.sections]
results = list(executor.map(process_section, section_ids))
# Merge results if needed
# (This is a simplified example - you would need to merge the results properly)
for result_doc in results:
# Update the original document with section results
for section in result_doc.sections:
# Find matching section in original document
orig_section = next((s for s in document.sections if s.section_id == section.section_id), None)
if orig_section and section.attributes:
if orig_section.attributes is None:
orig_section.attributes = {}
orig_section.attributes.update(section.attributes)