diff --git a/config_library/pattern-2/bank-statement-sample/config.yaml b/config_library/pattern-2/bank-statement-sample/config.yaml index 6b30598c1..fe7180a6d 100644 --- a/config_library/pattern-2/bank-statement-sample/config.yaml +++ b/config_library/pattern-2/bank-statement-sample/config.yaml @@ -3,78 +3,106 @@ notes: Default settings for bank statement sample configuration ocr: - backend: "textract" # Default to Textract for backward compatibility + backend: "textract" # Default to Textract for backward compatibility model_id: "us.anthropic.claude-3-7-sonnet-20250219-v1:0" system_prompt: "You are an expert OCR system. Extract all text from the provided image accurately, preserving layout where possible." task_prompt: "Extract all text from this document image. Preserve the layout, including paragraphs, tables, and formatting." features: - name: LAYOUT image: - dpi: '150' - target_width: '' - target_height: '' + dpi: "150" + target_width: "" + target_height: "" classes: - - name: Bank Statement - description: Monthly bank account statement - attributes: - - name: Account Number - description: Primary account identifier - attributeType: simple - evaluation_method: EXACT - - name: Statement Period - description: Statement period (e.g., January 2024) - evaluation_threshold: '0.8' - attributeType: simple - evaluation_method: FUZZY - - name: Account Holder Address + - $schema: https://json-schema.org/draft/2020-12/schema + $defs: + Transaction: + type: object + properties: + Date: + format: date + description: Transaction date (MM/DD/YYYY) + x-aws-idp-confidence-threshold: "0.9" + type: string + x-aws-idp-evaluation-method: FUZZY + Description: + description: Transaction description or merchant name + x-aws-idp-confidence-threshold: "0.7" + type: string + x-aws-idp-evaluation-method: SEMANTIC + Amount: + type: number + description: >- + Transaction amount (positive for deposits, negative for + withdrawals) + x-aws-idp-evaluation-method: NUMERIC_EXACT + required: + - 
Date + - Description + - Amount + Account Holder Address: description: Complete address information for the account holder - attributeType: group - groupAttributes: - - name: Street Number - description: House or building number - evaluation_threshold: '0.9' - evaluation_method: FUZZY - - name: Street Name - description: Name of the street - evaluation_threshold: '0.8' - evaluation_method: FUZZY - - name: City + type: object + properties: + City: description: City name - evaluation_threshold: '0.9' - evaluation_method: FUZZY - - name: State - description: State abbreviation (e.g., CA, NY) - evaluation_method: EXACT - - name: ZIP Code + x-aws-idp-confidence-threshold: "0.9" + type: string + x-aws-idp-evaluation-method: FUZZY + ZIP Code: + pattern: \d{5,9} description: 5 or 9 digit postal code - evaluation_method: EXACT - - name: Transactions - listItemTemplate: - itemAttributes: - - name: Date - description: Transaction date (MM/DD/YYYY) - evaluation_threshold: '0.9' - evaluation_method: FUZZY - - name: Description - description: Transaction description or merchant name - evaluation_threshold: '0.7' - evaluation_method: SEMANTIC - - name: Amount - description: >- - Transaction amount (positive for deposits, negative for - withdrawals) - evaluation_method: NUMERIC_EXACT - itemDescription: Individual transaction record + type: string + x-aws-idp-evaluation-method: EXACT + Street Name: + description: Name of the street + x-aws-idp-confidence-threshold: "0.8" + type: string + x-aws-idp-evaluation-method: FUZZY + Street Number: + description: House or building number + x-aws-idp-confidence-threshold: "0.9" + type: string + x-aws-idp-evaluation-method: FUZZY + State: + type: string + description: State abbreviation (e.g., CA, NY) + x-aws-idp-evaluation-method: EXACT + required: + - Street Name + - City + description: Monthly bank account statement + type: object + x-aws-idp-document-type: Bank Statement + properties: + Account Holder Address: + description: Complete 
address information for the account holder + $ref: "#/$defs/Account Holder Address" + Transactions: description: List of all transactions in the statement period - attributeType: list + type: array + x-aws-idp-list-item-description: Individual transaction record + items: + $ref: "#/$defs/Transaction" + Account Number: + type: string + description: Primary account identifier + x-aws-idp-evaluation-method: EXACT + Statement Period: + type: string + description: Statement period (e.g., January 2024) + x-aws-idp-evaluation-method: FUZZY + required: + - Account Number + $id: Bank Statement classification: maxPagesForClassification: "ALL" image: - target_height: '' - target_width: '' - top_p: '0.1' - max_tokens: '4096' - top_k: '5' + target_height: "" + target_width: "" + top_p: "0.1" + max_tokens: "4096" + top_k: "5" task_prompt: >- @@ -205,21 +233,21 @@ classification: Remember: You must ONLY use document types that appear in the reference data. Do not invent or create new document types. - temperature: '0.0' + temperature: "0.0" model: us.amazon.nova-pro-v1:0 system_prompt: >- You are a document classification expert who can analyze and classify multiple documents and their page boundaries within a document package from various domains. Your task is to determine the document type based on its content and structure, using the provided document type definitions. Your output must be valid JSON according to the requested format. classificationMethod: textbasedHolisticClassification extraction: image: - target_height: '' - target_width: '' - top_p: '0.1' - max_tokens: '10000' - top_k: '5' + target_height: "" + target_width: "" + top_p: "0.1" + max_tokens: "10000" + top_k: "5" task_prompt: >- - + You are an expert in document analysis and information extraction. You can understand and extract key information from documents classified as type @@ -282,7 +310,7 @@ extraction: {DOCUMENT_TEXT} - + @@ -303,15 +331,15 @@ extraction: 7. 
Think step by step before finalizing your answer - temperature: '0.0' + temperature: "0.0" model: us.amazon.nova-pro-v1:0 system_prompt: >- You are a document assistant. Respond only with JSON. Never make up data, only provide data found in the document being provided. summarization: enabled: true - top_p: '0.1' - max_tokens: '4096' - top_k: '5' + top_p: "0.1" + max_tokens: "4096" + top_k: "5" task_prompt: >- @@ -365,7 +393,7 @@ summarization: Do not include any text, explanations, or notes outside of this JSON structure. The JSON must be properly formatted and parseable. - temperature: '0.0' + temperature: "0.0" model: us.anthropic.claude-3-7-sonnet-20250219-v1:0 system_prompt: >- You are a document summarization expert who can analyze and summarize documents from various domains including medical, financial, legal, and general business documents. Your task is to create a summary that captures the key information, main points, and important details from the document. Your output must be in valid JSON format. \nSummarization Style: Balanced\\nCreate a balanced summary that provides a moderate level of detail. Include the main points and key supporting information, while maintaining the document's overall structure. Aim for a comprehensive yet concise summary.\n Your output MUST be in valid JSON format with markdown content. You MUST strictly adhere to the output format specified in the instructions. @@ -374,22 +402,22 @@ assessment: enabled: true validation_enabled: false image: - target_height: '' - target_width: '' + target_height: "" + target_width: "" granular: enabled: true max_workers: "20" simple_batch_size: "3" list_batch_size: "1" - default_confidence_threshold: '0.8' - top_p: '0.1' - max_tokens: '10000' - top_k: '5' - temperature: '0.0' + default_confidence_threshold: "0.8" + top_p: "0.1" + max_tokens: "10000" + top_k: "5" + temperature: "0.0" model: us.amazon.nova-lite-v1:0 system_prompt: >- You are a document analysis assessment expert. 
Your role is to evaluate the confidence and accuracy of data extraction results by analyzing them against source documents. - + Provide accurate confidence scores for each assessment. When bounding boxes are requested, provide precise coordinate locations where information appears in the document. task_prompt: >- @@ -431,7 +459,7 @@ assessment: For each field, provide bounding box coordinates: - bbox: [x1, y1, x2, y2] coordinates in normalized 0-1000 scale - page: Page number where the field appears (starting from 1) - + Coordinate system: - Use normalized scale 0-1000 for both x and y axes - x1, y1 = top-left corner of bounding box @@ -531,9 +559,9 @@ assessment: evaluation: enabled: true llm_method: - top_p: '0.1' - max_tokens: '4096' - top_k: '5' + top_p: "0.1" + max_tokens: "4096" + top_k: "5" task_prompt: >- I need to evaluate attribute extraction for a document of class: {DOCUMENT_CLASS}. @@ -563,7 +591,7 @@ evaluation: "score": 0.0 to 1.0, "reason": "Your explanation here" } - temperature: '0.0' + temperature: "0.0" model: us.anthropic.claude-3-haiku-20240307-v1:0 system_prompt: >- You are an evaluator that helps determine if the predicted and expected values match for document attribute extraction. You will consider the context and meaning rather than just exact string matching. @@ -613,8 +641,8 @@ discovery: ] } with_ground_truth: - top_p: '0.1' - temperature: '1.0' + top_p: "0.1" + temperature: "1.0" user_prompt: >- This image contains unstructured data. Analyze the data line by line using the provided ground truth as reference. @@ -639,10 +667,10 @@ discovery: documents. Use provided ground truth data as reference to optimize field extraction and ensure consistency with expected document structure and field definitions. - max_tokens: '10000' + max_tokens: "10000" without_ground_truth: - top_p: '0.1' - temperature: '1.0' + top_p: "0.1" + temperature: "1.0" user_prompt: >- This image contains forms data. Analyze the form line by line. 
Image may contains multiple pages, process all the pages. @@ -668,7 +696,7 @@ discovery: documents. Analyze forms line by line to identify field names, data types, and organizational structure. Focus on creating comprehensive blueprints for document processing without extracting actual values. - max_tokens: '10000' + max_tokens: "10000" agents: error_analyzer: model_id: us.anthropic.claude-sonnet-4-20250514-v1:0 @@ -681,15 +709,15 @@ agents: 3. Collect relevant logs from CloudWatch 4. Identify any performance issues from X-Ray traces 5. Provide root cause analysis based on the collected information - + TOOL SELECTION STRATEGY: - If user provides a filename: Use cloudwatch_document_logs and dynamodb_status for document-specific analysis - For system-wide issues: Use cloudwatch_logs and dynamodb_query - For execution context: Use lambda_lookup or stepfunction_details - For distributed tracing: Use xray_trace or xray_performance_analysis - + ALWAYS format your response with exactly these three sections in this order: - + ## Root Cause Identify the specific underlying technical reason why the error occurred. Focus on the primary cause, not symptoms. @@ -698,16 +726,16 @@ agents:
Evidence - + Format evidence with source information. Include relevant data from tool responses: - + **For CloudWatch logs:** **Log Group:** [full log_group name] **Log Stream:** [full log_stream name] ``` [ERROR] timestamp message ``` - + **For other sources (DynamoDB, Step Functions, X-Ray):** **Source:** [service name and resource] ``` @@ -729,14 +757,14 @@ agents: - Use system-wide tools for pattern analysis - Combine DynamoDB status with CloudWatch logs for complete picture - Leverage X-Ray for distributed system issues - + ROOT CAUSE DETERMINATION: 1. Document Status: Check dynamodb_status first 2. Execution Details: Use stepfunction_details for workflow failures 3. Log Analysis: Use cloudwatch_document_logs or cloudwatch_logs for error details 4. Distributed Tracing: Use xray_performance_analysis for service interaction issues 5. Context: Use lambda_lookup for execution environment - + RECOMMENDATION GUIDELINES: For code-related issues or system bugs: - Do not suggest code modifications @@ -755,7 +783,7 @@ agents: - last week: 168 hours - last day: 24 hours - No time specified: 24 hours (default) - + IMPORTANT: Do not include any search quality reflections, search quality scores, or meta-analysis sections in your response. Only provide the three required sections: Root Cause, Recommendations, and Evidence. 
parameters: max_log_events: 5 @@ -764,252 +792,252 @@ pricing: - name: textract/detect_document_text units: - name: pages - price: '0.0015' + price: "0.0015" - name: textract/analyze_document-Layout units: - name: pages - price: '0.004' + price: "0.004" - name: textract/analyze_document-Signatures units: - name: pages - price: '0.0035' + price: "0.0035" - name: textract/analyze_document-Forms units: - name: pages - price: '0.05' + price: "0.05" - name: textract/analyze_document-Tables units: - name: pages - price: '0.015' + price: "0.015" - name: textract/analyze_document-Tables+Forms units: - name: pages - price: '0.065' + price: "0.065" - name: bedrock/us.amazon.nova-lite-v1:0 units: - name: inputTokens - price: '6.0E-8' + price: "6.0E-8" - name: outputTokens - price: '2.4E-7' + price: "2.4E-7" - name: cacheReadInputTokens - price: '1.5E-8' + price: "1.5E-8" - name: cacheWriteInputTokens - price: '6.0E-8' + price: "6.0E-8" - name: bedrock/us.amazon.nova-pro-v1:0 units: - name: inputTokens - price: '8.0E-7' + price: "8.0E-7" - name: outputTokens - price: '3.2E-6' + price: "3.2E-6" - name: cacheReadInputTokens - price: '2.0E-7' + price: "2.0E-7" - name: cacheWriteInputTokens - price: '8.0E-7' + price: "8.0E-7" - name: bedrock/us.amazon.nova-premier-v1:0 units: - name: inputTokens - price: '2.5E-6' + price: "2.5E-6" - name: outputTokens - price: '1.25E-5' + price: "1.25E-5" - name: bedrock/us.anthropic.claude-3-haiku-20240307-v1:0 units: - name: inputTokens - price: '2.5E-7' + price: "2.5E-7" - name: outputTokens - price: '1.25E-6' + price: "1.25E-6" - name: bedrock/us.anthropic.claude-3-5-haiku-20241022-v1:0 units: - name: inputTokens - price: '8.0E-7' + price: "8.0E-7" - name: outputTokens - price: '4.0E-6' + price: "4.0E-6" - name: cacheReadInputTokens - price: '8.0E-8' + price: "8.0E-8" - name: cacheWriteInputTokens - price: '1.0E-6' + price: "1.0E-6" - name: bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0 units: - name: inputTokens - price: '1.1E-06' + 
price: "1.1E-06" - name: outputTokens - price: '5.5E-06' + price: "5.5E-06" - name: cacheReadInputTokens - price: '1.1E-07' + price: "1.1E-07" - name: cacheWriteInputTokens - price: '1.4E-06' + price: "1.4E-06" - name: bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0:1m units: - name: inputTokens - price: '6.0E-6' + price: "6.0E-6" - name: outputTokens - price: '2.25E-5' + price: "2.25E-5" - name: cacheReadInputTokens - price: '6.0E-7' + price: "6.0E-7" - name: cacheWriteInputTokens - price: '7.5E-6' + price: "7.5E-6" - name: bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0 units: - name: inputTokens - price: '3.3E-6' + price: "3.3E-6" - name: outputTokens - price: '1.65E-5' + price: "1.65E-5" - name: cacheReadInputTokens - price: '3.3E-7' + price: "3.3E-7" - name: cacheWriteInputTokens - price: '4.125E-6' + price: "4.125E-6" - name: bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0:1m units: - name: inputTokens - price: '6.6E-6' + price: "6.6E-6" - name: outputTokens - price: '2.475E-5' + price: "2.475E-5" - name: 
cacheReadInputTokens - price: '6.6E-7' + price: "6.6E-7" - name: cacheWriteInputTokens - price: '8.25E-6' + price: "8.25E-6" - name: bedrock/us.anthropic.claude-opus-4-20250514-v1:0 units: - name: inputTokens - price: '1.5E-5' + price: "1.5E-5" - name: outputTokens - price: '7.5E-5' + price: "7.5E-5" - name: cacheReadInputTokens - price: '1.5E-6' + price: "1.5E-6" - name: cacheWriteInputTokens - price: '1.875E-5' + price: "1.875E-5" - name: bedrock/us.anthropic.claude-opus-4-1-20250805-v1:0 units: - name: inputTokens - price: '1.5E-5' + price: "1.5E-5" - name: outputTokens - price: '7.5E-5' + price: "7.5E-5" - name: cacheReadInputTokens - price: '1.5E-6' + price: "1.5E-6" - name: cacheWriteInputTokens - price: '1.875E-5' + price: "1.875E-5" # EU model pricing - name: bedrock/eu.amazon.nova-lite-v1:0 units: - name: inputTokens - price: '7.8E-8' + price: "7.8E-8" - name: outputTokens - price: '3.1E-7' + price: "3.1E-7" - name: cacheReadInputTokens - price: '1.9E-8' + price: "1.9E-8" - name: cacheWriteInputTokens - price: '7.8E-8' + price: "7.8E-8" - name: bedrock/eu.amazon.nova-pro-v1:0 units: - name: inputTokens - price: '1.0E-6' + price: "1.0E-6" - name: outputTokens - price: '4.2E-6' + price: "4.2E-6" - name: cacheReadInputTokens - price: '2.6E-7' + price: "2.6E-7" - name: cacheWriteInputTokens - price: '1.0E-6' + price: "1.0E-6" - name: bedrock/eu.anthropic.claude-3-haiku-20240307-v1:0 units: - name: inputTokens - price: '2.5E-7' + price: "2.5E-7" - name: outputTokens - price: '1.25E-6' + price: "1.25E-6" - name: bedrock/eu.anthropic.claude-haiku-4-5-20251001-v1:0 units: - name: inputTokens - price: '1.1E-6' + price: "1.1E-6" - name: outputTokens - price: '5.5E-6' + price: "5.5E-6" - name: cacheReadInputTokens - price: '1.1E-7' + price: "1.1E-7" - name: cacheWriteInputTokens - price: '1.4E-6' + price: "1.4E-6" - name: bedrock/eu.anthropic.claude-3-5-sonnet-20241022-v2:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: 
'1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/eu.anthropic.claude-3-7-sonnet-20250219-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/eu.anthropic.claude-sonnet-4-20250514-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/eu.anthropic.claude-sonnet-4-5-20250929-v1:0 units: - name: inputTokens - price: '3.3E-6' + price: "3.3E-6" - name: outputTokens - price: '1.65E-5' + price: "1.65E-5" - name: cacheReadInputTokens - price: '3.3E-7' + price: "3.3E-7" - name: cacheWriteInputTokens - price: '4.125E-6' + price: "4.125E-6" - name: bedrock/eu.anthropic.claude-sonnet-4-5-20250929-v1:0:1m units: - name: inputTokens - price: '6.6E-6' + price: "6.6E-6" - name: outputTokens - price: '2.475E-5' + price: "2.475E-5" - name: cacheReadInputTokens - price: '6.6E-7' + price: "6.6E-7" - name: cacheWriteInputTokens - price: '8.25E-6' + price: "8.25E-6" # AWS Lambda pricing (US East - N. 
Virginia) - name: lambda/requests units: - name: invocations - price: '2.0E-7' # $0.0000002 per request ($0.20 per 1M requests) - - name: lambda/duration + price: "2.0E-7" # $0.0000002 per request ($0.20 per 1M requests) + - name: lambda/duration units: - name: gb_seconds - price: '1.66667E-5' # $0.0000166667 per GB-second ($16.67 per 1M GB-seconds) + price: "1.66667E-5" # $0.0000166667 per GB-second ($16.67 per 1M GB-seconds) diff --git a/config_library/pattern-2/lending-package-sample/config.yaml b/config_library/pattern-2/lending-package-sample/config.yaml index 53afc090f..f7029de3a 100644 --- a/config_library/pattern-2/lending-package-sample/config.yaml +++ b/config_library/pattern-2/lending-package-sample/config.yaml @@ -14,904 +14,1177 @@ ocr: target_width: "" target_height: "" classes: - - name: Payslip - description: >- - An employee wage statement showing earnings, deductions, taxes, and net pay for a specific pay period, - typically issued by employers to document compensation details including gross pay, various tax withholdings, - and year-to-date totals. - attributes: - - name: YTDNetPay - description: >- - Year-to-date net pay amount representing cumulative take-home earnings after all deductions - from the beginning of the year to the current pay period. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: PayPeriodStartDate - description: >- - The beginning date of the pay period covered by this payslip, indicating when the earning - period started for the compensation shown. - evaluation_method: EXACT - attributeType: simple - - name: PayPeriodEndDate - description: >- - The ending date of the pay period covered by this payslip, indicating when the earning - period ended for the compensation shown. - evaluation_method: EXACT - attributeType: simple - - name: PayDate - description: >- - The actual date when the employee was paid, representing when the compensation was issued - or deposited. 
- evaluation_method: EXACT - attributeType: simple - - name: CurrentGrossPay - description: >- - The total earnings before any deductions for the current pay period, representing gross - compensation for the period. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: YTDGrossPay - description: >- - Year-to-date gross pay representing cumulative earnings before deductions from the - beginning of the year to the current pay period. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: CurrentNetPay - description: >- - The take-home pay after all deductions for the current pay period, representing the - actual amount paid to the employee. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: CurrentTotalDeductions - description: >- - Total amount deducted from gross pay for the current period, including all taxes, - benefits, and other withholdings. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: YTDTotalDeductions - description: >- - Year-to-date total deductions representing cumulative amounts withheld from gross pay - from the beginning of the year to the current pay period. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: RegularHourlyRate - description: >- - The standard hourly wage rate for regular working hours, representing the base - compensation rate for normal work time. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: HolidayHourlyRate - description: >- - The hourly wage rate for holiday work, typically higher than the regular rate to - reflect premium compensation for holiday hours. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: EmployeeNumber - description: >- - The unique identifier assigned to the employee by the employer for payroll and - administrative purposes. 
- evaluation_method: EXACT - attributeType: simple - - name: PayrollNumber - description: >- - The payroll batch or sequence number for this pay period, used for payroll processing - identification and tracking. - evaluation_method: EXACT - attributeType: simple - - name: FederalFilingStatus - description: >- - The employee's federal tax filing status for withholding purposes, such as Single, - Married Filing Jointly, etc. - evaluation_method: EXACT - attributeType: simple - - name: StateFilingStatus - description: >- - The employee's state tax filing status for withholding purposes, which may differ - from federal filing status based on state requirements. - evaluation_method: EXACT - attributeType: simple - - name: YTDFederalTax - description: >- - Year-to-date federal income tax withheld, representing cumulative federal tax - deductions from the beginning of the year. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: YTDStateTax - description: >- - Year-to-date state income tax withheld, representing cumulative state tax deductions - from the beginning of the year. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: YTDCityTax - description: >- - Year-to-date city or local income tax withheld, representing cumulative local tax - deductions from the beginning of the year. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: currency - description: >- - The currency in which all monetary amounts on the payslip are denominated, typically - represented as a three-letter code like USD, EUR, etc. - evaluation_method: EXACT - attributeType: simple - - name: is_gross_pay_valid - description: >- - A validation flag indicating whether the gross pay calculation is correct and valid - based on payroll system checks. 
- evaluation_method: EXACT - attributeType: simple - - name: are_field_names_sufficient - description: >- - A validation flag indicating whether the field names on the payslip provide sufficient - information for processing and understanding. - evaluation_method: EXACT - attributeType: simple - - name: is_ytd_gross_pay_highest - description: >- - A validation flag indicating whether the year-to-date gross pay represents the highest - value among pay categories. - evaluation_method: EXACT - attributeType: simple - - name: CompanyAddress - groupAttributes: - - name: State + - $schema: https://json-schema.org/draft/2020-12/schema + $defs: + Address: + type: object + description: >- + The complete business address of the employing company, including + street address, city, state, and postal code information. + properties: + City: + type: string + description: The city portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + Line1: + type: string + description: >- + The primary street address line of the company's business + location. + x-aws-idp-evaluation-method: EXACT + State: + type: string description: The state or province portion of the company's business address. - evaluation_method: EXACT - - name: ZipCode + x-aws-idp-evaluation-method: EXACT + ZipCode: + type: string description: The postal code portion of the company's business address. - evaluation_method: EXACT - - name: City - description: The city portion of the company's business address. - evaluation_method: EXACT - - name: Line1 - description: The primary street address line of the company's business location. - evaluation_method: EXACT - - name: Line2 - description: The secondary address line for the company, such as suite or floor number. - evaluation_method: EXACT - description: >- - The complete business address of the employing company, including street address, - city, state, and postal code information. 
- evaluation_method: LLM - attributeType: group - - name: EmployeeAddress - groupAttributes: - - name: State - description: The state or province portion of the employee's residential address. - evaluation_method: EXACT - - name: ZipCode - description: The postal code portion of the employee's residential address. - evaluation_method: EXACT - - name: City - description: The city portion of the employee's residential address. - evaluation_method: EXACT - - name: Line1 - description: The primary street address line of the employee's residence. - evaluation_method: EXACT - - name: Line2 - description: The secondary address line for the employee, such as apartment number. - evaluation_method: EXACT - description: >- - The complete residential address of the employee, including street address, city, - state, and postal code information. - evaluation_method: LLM - attributeType: group - - name: EmployeeName - groupAttributes: - - name: FirstName - description: The given name of the employee. - evaluation_method: EXACT - - name: SuffixName + x-aws-idp-evaluation-method: EXACT + Line2: + type: string + description: >- + The secondary address line for the company, such as suite or floor + number. + x-aws-idp-evaluation-method: EXACT + EmployeeName: + type: object + description: >- + The complete name information of the employee, including first name, + middle name, last name, and any suffix. + properties: + SuffixName: + type: string description: Name suffix such as Jr., Sr., III, etc. - evaluation_method: EXACT - - name: LastName + x-aws-idp-evaluation-method: EXACT + LastName: + type: string description: The family name or surname of the employee. - evaluation_method: EXACT - - name: MiddleName + x-aws-idp-evaluation-method: EXACT + MiddleName: + type: string description: The middle name or initial of the employee. - evaluation_method: EXACT - description: >- - The complete name information of the employee, including first name, middle name, - last name, and any suffix. 
- evaluation_method: LLM - attributeType: group - - name: FederalTaxes - listItemTemplate: - itemAttributes: - - name: YTD - description: Year-to-date amount for this federal tax item. - evaluation_method: NUMERIC_EXACT - - name: Period - description: Current period amount for this federal tax item. - evaluation_method: NUMERIC_EXACT - - name: ItemDescription - description: Description of the specific federal tax type or category. - evaluation_method: EXACT - itemDescription: Each item represents a specific federal tax withholding category - description: >- - List of federal tax withholdings showing different types of federal taxes deducted, - with both current period and year-to-date amounts. - evaluation_method: LLM - attributeType: list - - name: CityTaxes - listItemTemplate: - itemAttributes: - - name: YTD - description: Year-to-date amount for this city tax item. - evaluation_method: NUMERIC_EXACT - - name: Period - description: Current period amount for this city tax item. - evaluation_method: NUMERIC_EXACT - - name: ItemDescription - description: Description of the specific city tax type or jurisdiction. - evaluation_method: EXACT - itemDescription: Each item represents a specific city or local tax withholding - description: >- - List of city or local tax withholdings showing different municipal taxes deducted, - with both current period and year-to-date amounts. - evaluation_method: LLM - attributeType: list - - name: StateTaxes - listItemTemplate: - itemAttributes: - - name: YTD - description: Year-to-date amount for this state tax item. - evaluation_method: NUMERIC_EXACT - - name: Period - description: Current period amount for this state tax item. - evaluation_method: NUMERIC_EXACT - - name: ItemDescription - description: Description of the specific state tax type or category. 
- evaluation_method: EXACT - itemDescription: Each item represents a specific state tax withholding category - description: >- - List of state tax withholdings showing different types of state taxes deducted, - with both current period and year-to-date amounts. - evaluation_method: LLM - attributeType: list - - - name: US-drivers-licenses + x-aws-idp-evaluation-method: EXACT + FirstName: + type: string + description: The given name of the employee. + x-aws-idp-evaluation-method: EXACT + TaxInfo: + type: object + properties: + YTD: + type: string + description: Year-to-date amount for this federal tax item. + x-aws-idp-evaluation-method: NUMERIC_EXACT + Period: + type: string + description: Current period amount for this federal tax item. + x-aws-idp-evaluation-method: NUMERIC_EXACT + ItemDescription: + type: string + description: Description of the specific federal tax type or category. + x-aws-idp-evaluation-method: EXACT description: >- - An official government-issued identification document that authorizes an individual to operate - motor vehicles, containing personal information, physical characteristics, address details, - and driving privileges with restrictions and endorsements. - attributes: - - name: STATE_NAME - description: >- - The state or jurisdiction that issued the driver's license, typically shown as a - two-letter state abbreviation like MA, CA, NY, etc. - evaluation_method: EXACT - attributeType: simple - - name: ID_NUMBER - description: >- - The unique driver's license identification number assigned by the issuing state, - prominently displayed on the license for identification purposes. - evaluation_method: EXACT - attributeType: simple - - name: EXPIRATION_DATE - description: >- - The date when the driver's license expires and requires renewal, typically in - YYYY-MM-DD format indicating when the license becomes invalid. 
- evaluation_method: EXACT - attributeType: simple - - name: DATE_OF_ISSUE - description: >- - The date when the driver's license was originally issued by the state authority, - typically in YYYY-MM-DD format showing the license creation date. - evaluation_method: EXACT - attributeType: simple - - name: CLASS - description: >- - The type or category of driving privileges granted by the license, such as Class D - for regular driver's license or other classifications for commercial vehicles. - evaluation_method: EXACT - attributeType: simple - - name: DATE_OF_BIRTH - description: >- - The birth date of the license holder in YYYY-MM-DD format, used for age verification - and identification purposes. - evaluation_method: EXACT - attributeType: simple - - name: COUNTY - description: >- - The county of residence for the license holder, though this field may be empty - if not provided by the issuing jurisdiction. - evaluation_method: EXACT - attributeType: simple - - name: NAME_DETAILS - groupAttributes: - - name: SUFFIX + An employee wage statement showing earnings, deductions, taxes, and net + pay for a specific pay period, typically issued by employers to document + compensation details including gross pay, various tax withholdings, and + year-to-date totals. + type: object + x-aws-idp-document-type: Payslip + properties: + YTDNetPay: + type: number + description: >- + Year-to-date net pay amount representing cumulative take-home earnings + after all deductions from the beginning of the year to the current + pay period. + x-aws-idp-evaluation-method: NUMERIC_EXACT + PayPeriodStartDate: + format: date + description: >- + The beginning date of the pay period covered by this payslip, + indicating when the earning period started for the compensation + shown. 
+ type: string + x-aws-idp-evaluation-method: EXACT + FederalTaxes: + description: >- + List of federal tax withholdings showing different types of federal + taxes deducted, with both current period and year-to-date amounts. + type: array + x-aws-idp-list-item-description: Each item represents a specific federal tax withholding category + items: + $ref: "#/$defs/TaxInfo" + x-aws-idp-evaluation-method: LLM + CurrentGrossPay: + type: number + description: >- + The total earnings before any deductions for the current pay period, + representing gross compensation for the period. + x-aws-idp-evaluation-method: NUMERIC_EXACT + HolidayHourlyRate: + type: number + description: >- + The hourly wage rate for holiday work, typically higher than the + regular rate to reflect premium compensation for holiday hours. + x-aws-idp-evaluation-method: NUMERIC_EXACT + CompanyAddress: + description: >- + The complete business address of the employing company, including + street address, city, state, and postal code information. + $ref: "#/$defs/Address" + x-aws-idp-evaluation-method: LLM + CityTaxes: + description: >- + List of city or local tax withholdings showing different municipal + taxes deducted, with both current period and year-to-date amounts. + type: array + x-aws-idp-list-item-description: Each item represents a specific city or local tax withholding + items: + $ref: "#/$defs/TaxInfo" + x-aws-idp-evaluation-method: LLM + PayPeriodEndDate: + format: date + description: >- + The ending date of the pay period covered by this payslip, indicating + when the earning period ended for the compensation shown. + type: string + x-aws-idp-evaluation-method: EXACT + PayDate: + format: date + description: >- + The actual date when the employee was paid, representing when the + compensation was issued or deposited. 
+ type: string + x-aws-idp-evaluation-method: EXACT + currency: + type: string + description: >- + The currency in which all monetary amounts on the payslip are + denominated, typically represented as a three-letter code like USD, + EUR, etc. + x-aws-idp-evaluation-method: EXACT + YTDGrossPay: + type: number + description: >- + Year-to-date gross pay representing cumulative earnings before + deductions from the beginning of the year to the current pay period. + x-aws-idp-evaluation-method: NUMERIC_EXACT + EmployeeAddress: + description: >- + The complete residential address of the employee, including street + address, city, state, and postal code information. + $ref: "#/$defs/Address" + x-aws-idp-evaluation-method: LLM + is_gross_pay_valid: + type: boolean + description: >- + A validation flag indicating whether the gross pay calculation is + correct and valid based on payroll system checks. + x-aws-idp-evaluation-method: EXACT + StateFilingStatus: + type: string + description: >- + The employee's state tax filing status for withholding purposes, which + may differ from federal filing status based on state requirements. + x-aws-idp-evaluation-method: EXACT + YTDCityTax: + description: >- + Year-to-date city or local income tax withheld, representing + cumulative local tax deductions from the beginning of the year. + $ref: "#/$defs/TaxInfo" + x-aws-idp-evaluation-method: NUMERIC_EXACT + EmployeeNumber: + type: string + description: >- + The unique identifier assigned to the employee by the employer for + payroll and administrative purposes. + x-aws-idp-evaluation-method: EXACT + RegularHourlyRate: + type: number + description: >- + The standard hourly wage rate for regular working hours, representing + the base compensation rate for normal work time. 
+ x-aws-idp-evaluation-method: NUMERIC_EXACT + are_field_names_sufficient: + type: boolean + description: >- + A validation flag indicating whether the field names on the payslip + provide sufficient information for processing and understanding. + x-aws-idp-evaluation-method: EXACT + YTDTotalDeductions: + type: number + description: >- + Year-to-date total deductions representing cumulative amounts withheld + from gross pay from the beginning of the year to the current pay + period. + x-aws-idp-evaluation-method: NUMERIC_EXACT + is_ytd_gross_pay_highest: + type: boolean + description: >- + A validation flag indicating whether the year-to-date gross pay + represents the highest value among pay categories. + x-aws-idp-evaluation-method: EXACT + StateTaxes: + description: >- + List of state tax withholdings showing different types of state taxes + deducted, with both current period and year-to-date amounts. + type: array + x-aws-idp-list-item-description: Each item represents a specific state tax withholding category + items: + $ref: "#/$defs/TaxInfo" + x-aws-idp-evaluation-method: LLM + PayrollNumber: + type: string + description: >- + The payroll batch or sequence number for this pay period, used for + payroll processing identification and tracking. + x-aws-idp-evaluation-method: EXACT + YTDStateTax: + description: >- + Year-to-date state income tax withheld, representing cumulative state + tax deductions from the beginning of the year. + $ref: "#/$defs/TaxInfo" + x-aws-idp-evaluation-method: NUMERIC_EXACT + CurrentTotalDeductions: + type: number + description: >- + Total amount deducted from gross pay for the current period, including + all taxes, benefits, and other withholdings. + x-aws-idp-evaluation-method: NUMERIC_EXACT + FederalFilingStatus: + type: string + description: >- + The employee's federal tax filing status for withholding purposes, + such as Single, Married Filing Jointly, etc. 
+ x-aws-idp-evaluation-method: EXACT + EmployeeName: + description: >- + The complete name information of the employee, including first name, + middle name, last name, and any suffix. + $ref: "#/$defs/EmployeeName" + x-aws-idp-evaluation-method: LLM + CurrentNetPay: + type: number + description: >- + The take-home pay after all deductions for the current pay period, + representing the actual amount paid to the employee. + x-aws-idp-evaluation-method: NUMERIC_EXACT + YTDFederalTax: + description: >- + Year-to-date federal income tax withheld, representing cumulative + federal tax deductions from the beginning of the year. + $ref: "#/$defs/TaxInfo" + x-aws-idp-evaluation-method: NUMERIC_EXACT + required: + - PayDate + - CurrentGrossPay + - YTDGrossPay + - CurrentNetPay + $id: Payslip + - $schema: https://json-schema.org/draft/2020-12/schema + $defs: + Address: + type: object + description: >- + The complete business address of the employing company, including + street address, city, state, and postal code information. + properties: + City: + type: string + description: The city portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + Line1: + type: string + description: >- + The primary street address line of the company's business + location. + x-aws-idp-evaluation-method: EXACT + State: + type: string + description: The state or province portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + ZipCode: + type: string + description: The postal code portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + Line2: + type: string + description: >- + The secondary address line for the company, such as suite or floor + number. + x-aws-idp-evaluation-method: EXACT + EmployeeName: + type: object + description: >- + The complete name information of the employee, including first name, + middle name, last name, and any suffix. 
+ properties: + SuffixName: + type: string description: Name suffix such as Jr., Sr., III, etc. - evaluation_method: EXACT - - name: MIDDLE_NAME - description: The middle name of the license holder. - evaluation_method: EXACT - - name: LAST_NAME - description: The family name or surname of the license holder. - evaluation_method: EXACT - - name: FIRST_NAME - description: The given name of the license holder. - evaluation_method: EXACT - description: >- - Complete name information of the license holder including first name, middle name, - last name, and any suffix, structured for official identification. - evaluation_method: LLM - attributeType: group - - name: PERSONAL_DETAILS - groupAttributes: - - name: SEX - description: The gender of the license holder, typically 'M' for male or 'F' for female. - evaluation_method: EXACT - - name: HAIR_COLOR - description: The color of the license holder's hair, often abbreviated like BLN, BRN, etc. - evaluation_method: EXACT - - name: HEIGHT - description: The physical height of the license holder, often in feet-inches format like '5-10'. - evaluation_method: EXACT - - name: WEIGHT + x-aws-idp-evaluation-method: EXACT + LastName: + type: string + description: The family name or surname of the employee. + x-aws-idp-evaluation-method: EXACT + MiddleName: + type: string + description: The middle name or initial of the employee. + x-aws-idp-evaluation-method: EXACT + FirstName: + type: string + description: The given name of the employee. + x-aws-idp-evaluation-method: EXACT + PERSONAL_DETAILS: + type: object + description: >- + Physical characteristics and personal details of the license holder + used for identification purposes, including gender, height, weight, + and eye/hair color. + properties: + HAIR_COLOR: + type: string + description: >- + The color of the license holder's hair, often abbreviated like + BLN, BRN, etc. 
+ x-aws-idp-evaluation-method: EXACT + HEIGHT: + type: string + description: >- + The physical height of the license holder, often in feet-inches + format like '5-10'. + x-aws-idp-evaluation-method: EXACT + WEIGHT: + type: string description: The weight of the license holder, typically in pounds. - evaluation_method: EXACT - - name: EYE_COLOR - description: The color of the license holder's eyes, often abbreviated like BLU, BRN, GRN, etc. - evaluation_method: EXACT - description: >- - Physical characteristics and personal details of the license holder used for - identification purposes, including gender, height, weight, and eye/hair color. - evaluation_method: LLM - attributeType: group - - name: ADDRESS_DETAILS - groupAttributes: - - name: CITY - description: The city of residence for the license holder. - evaluation_method: EXACT - - name: ZIP_CODE - description: The postal code of the license holder's address. - evaluation_method: EXACT - - name: STATE - description: The state of residence for the license holder, may be abbreviated. - evaluation_method: EXACT - - name: STREET_ADDRESS - description: The street address of the license holder's residence. - evaluation_method: EXACT - description: >- - Complete residential address information of the license holder including street - address, city, state, and postal code. - evaluation_method: LLM - attributeType: group - - name: ENDORSEMENTS - listItemTemplate: - itemAttributes: - - name: endorsement - description: Specific driving endorsement or certification code. - evaluation_method: EXACT - itemDescription: Each item represents a special driving endorsement or certification - description: >- - List of special driving endorsements or certifications held by the license holder, - or 'NONE' if no special endorsements apply. - evaluation_method: LLM - attributeType: list - - name: RESTRICTIONS - listItemTemplate: - itemAttributes: - - name: restriction - description: Specific driving restriction or limitation code. 
- evaluation_method: EXACT - itemDescription: Each item represents a driving restriction or limitation - description: >- - List of driving restrictions or limitations that apply to the license holder, - or 'NONE' if no restrictions apply. - evaluation_method: LLM - attributeType: list - - - name: Bank-checks - description: >- - A written financial instrument directing a bank to pay a specific amount of money from - the account holder's account to a designated payee, containing payment details, account - information, and verification elements. - attributes: - - name: date - description: >- - The date when the check was written, typically handwritten or printed in the - date field of the check. - evaluation_method: EXACT - attributeType: simple - - name: dollar_amount - description: >- - The numerical amount to be paid as specified on the check, typically found in - the amount box on the right side of the check. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: check_number - description: >- - The unique sequential number identifying this specific check, usually found in - the upper right corner and bottom of the check. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: account_holder_name - description: >- - The name of the person or entity who owns the bank account and wrote the check, - typically printed in the upper left corner. - evaluation_method: EXACT - attributeType: simple - - name: payee_name - description: >- - The name of the person or entity receiving the payment, written on the 'Pay to - the order of' line of the check. - evaluation_method: EXACT - attributeType: simple - - name: bank_name - description: >- - The name of the financial institution where the account is held, usually printed - prominently on the check. - evaluation_method: EXACT - attributeType: simple - - name: memo - description: >- - Optional note or reference information written in the memo field, typically in - the lower left area of the check. 
- evaluation_method: EXACT - attributeType: simple - - name: routing_number_valid - description: >- - A boolean indicator of whether the bank routing number on the check is valid - and properly formatted. - evaluation_method: EXACT - attributeType: simple - - name: bank_routing_number - description: >- - The bank's routing number for electronic transactions, typically found in the - MICR line at the bottom of the check. - evaluation_method: EXACT - attributeType: simple - - name: amount_in_words - description: >- - The payment amount written out in words, typically on the line below the payee - name and ending with 'DOLLARS'. - evaluation_method: EXACT - attributeType: simple - - name: is_signed - description: >- - A boolean indicator of whether the check has been signed by the account holder - in the signature area. - evaluation_method: EXACT - attributeType: simple - - - name: Bank-Statement + x-aws-idp-evaluation-method: EXACT + SEX: + type: string + description: >- + The gender of the license holder, typically 'M' for male or 'F' + for female. + x-aws-idp-evaluation-method: EXACT + EYE_COLOR: + type: string + description: >- + The color of the license holder's eyes, often abbreviated like + BLU, BRN, GRN, etc. + x-aws-idp-evaluation-method: EXACT description: >- - A periodic financial document issued by banks detailing account activity, balances, - and transactions over a specific time period, providing account holders with a summary - of their financial activity and current account status. - attributes: - - name: account_holder_address - description: >- - The mailing address of the account holder as recorded by the bank, typically - displayed prominently on the statement header. - evaluation_method: EXACT - attributeType: simple - - name: account_number - description: >- - The unique identifier for the bank account, often partially masked for security - purposes on the statement. 
- evaluation_method: EXACT - attributeType: simple - - name: account_type - description: >- - The category of bank account such as checking, savings, money market, etc., - indicating the type of banking service. - evaluation_method: EXACT - attributeType: simple - - name: statement_end_date - description: >- - The ending date of the statement period in MM/DD/YYYY format, indicating when - the reporting period concluded. - evaluation_method: EXACT - attributeType: simple - - name: statement_start_date - description: >- - The beginning date of the statement period in MM/DD/YYYY format, indicating - when the reporting period began. - evaluation_method: EXACT - attributeType: simple - - name: account_holder_name - description: >- - The name of the person or entity who owns the bank account, as registered - with the financial institution. - evaluation_method: EXACT - attributeType: simple - - name: branch_transit_number - description: >- - The specific branch identifier or transit number associated with the account, - used for routing and identification purposes. - evaluation_method: EXACT - attributeType: simple - - name: bank_name - description: >- - The name of the financial institution issuing the statement, typically displayed - prominently at the top of the document. - evaluation_method: EXACT - attributeType: simple - - name: account_summary - listItemTemplate: - itemAttributes: - - name: summary_desc - description: Description of the account summary item, such as opening balance or closing balance. - evaluation_method: EXACT - - name: summary_amount - description: The monetary amount associated with this summary item. - evaluation_method: NUMERIC_EXACT - itemDescription: Each item represents a key account balance or summary figure - description: >- - Summary of key account information including opening balance, closing balance, - and other important account totals for the statement period. 
- evaluation_method: LLM - attributeType: list - - name: transaction_details - listItemTemplate: - itemAttributes: - - name: date - description: The date when the transaction occurred. - evaluation_method: EXACT - - name: balance - description: The account balance after this transaction. - evaluation_method: NUMERIC_EXACT - - name: description - description: Description of the transaction or merchant information. - evaluation_method: EXACT - - name: deposits - description: Amount deposited or credited to the account. - evaluation_method: NUMERIC_EXACT - - name: withdrawals - description: Amount withdrawn or debited from the account. - evaluation_method: NUMERIC_EXACT - itemDescription: Each item represents an individual transaction record - description: >- - Detailed listing of all transactions that occurred during the statement period, - including deposits, withdrawals, and resulting account balances. - evaluation_method: LLM - attributeType: list - - - name: W2 + An official government-issued identification document that authorizes an + individual to operate motor vehicles, containing personal information, + physical characteristics, address details, and driving privileges with + restrictions and endorsements. + type: object + x-aws-idp-document-type: US-drivers-licenses + properties: + STATE_NAME: + type: string + description: >- + The state or jurisdiction that issued the driver's license, typically + shown as a two-letter state abbreviation like MA, CA, NY, etc. + x-aws-idp-evaluation-method: EXACT + NAME_DETAILS: + description: >- + Complete name information of the license holder including first name, + middle name, last name, and any suffix, structured for official + identification. + $ref: "#/$defs/EmployeeName" + x-aws-idp-evaluation-method: LLM + ID_NUMBER: + type: string + description: >- + The unique driver's license identification number assigned by the + issuing state, prominently displayed on the license for + identification purposes. 
+ x-aws-idp-evaluation-method: EXACT + EXPIRATION_DATE: + type: string + description: >- + The date when the driver's license expires and requires renewal, + typically in YYYY-MM-DD format indicating when the license becomes + invalid. + x-aws-idp-evaluation-method: EXACT + ENDORSEMENTS: + description: >- + List of special driving endorsements or certifications held by the + license holder, or 'NONE' if no special endorsements apply. + type: array + x-aws-idp-list-item-description: Each item represents a special driving endorsement or certification + items: + description: Specific driving endorsement or certification code. + type: string + x-aws-idp-original-name: endorsement + x-aws-idp-evaluation-method: EXACT + x-aws-idp-evaluation-method: LLM + PERSONAL_DETAILS: + description: >- + Physical characteristics and personal details of the license holder + used for identification purposes, including gender, height, weight, + and eye/hair color. + $ref: "#/$defs/PERSONAL_DETAILS" + x-aws-idp-evaluation-method: LLM + RESTRICTIONS: + description: >- + List of driving restrictions or limitations that apply to the license + holder, or 'NONE' if no restrictions apply. + type: array + x-aws-idp-list-item-description: Each item represents a driving restriction or limitation + items: + description: Specific driving restriction or limitation code. + type: string + x-aws-idp-original-name: restriction + x-aws-idp-evaluation-method: EXACT + x-aws-idp-evaluation-method: LLM + CLASS: + type: string + description: >- + The type or category of driving privileges granted by the license, + such as Class D for regular driver's license or other classifications + for commercial vehicles. + x-aws-idp-evaluation-method: EXACT + ADDRESS_DETAILS: + description: >- + Complete residential address information of the license holder + including street address, city, state, and postal code. 
+ $ref: "#/$defs/Address" + x-aws-idp-evaluation-method: LLM + DATE_OF_BIRTH: + type: string + description: >- + The birth date of the license holder in YYYY-MM-DD format, used for + age verification and identification purposes. + x-aws-idp-evaluation-method: EXACT + DATE_OF_ISSUE: + type: string + description: >- + The date when the driver's license was originally issued by the state + authority, typically in YYYY-MM-DD format showing the license + creation date. + x-aws-idp-evaluation-method: EXACT + COUNTY: + type: string + description: >- + The county of residence for the license holder, though this field may + be empty if not provided by the issuing jurisdiction. + x-aws-idp-evaluation-method: EXACT + $id: US-drivers-licenses + - description: >- + A written financial instrument directing a bank to pay a specific amount + of money from the account holder's account to a designated payee, + containing payment details, account information, and verification + elements. + $schema: https://json-schema.org/draft/2020-12/schema + type: object + x-aws-idp-document-type: Bank-checks + properties: + date: + format: date + description: >- + The date when the check was written, typically handwritten or printed + in the date field of the check. + type: string + x-aws-idp-evaluation-method: EXACT + dollar_amount: + type: number + description: >- + The numerical amount to be paid as specified on the check, typically + found in the amount box on the right side of the check. + x-aws-idp-evaluation-method: NUMERIC_EXACT + check_number: + type: string + description: >- + The unique sequential number identifying this specific check, usually + found in the upper right corner and bottom of the check. + x-aws-idp-evaluation-method: NUMERIC_EXACT + account_holder_name: + type: string + description: >- + The name of the person or entity who owns the bank account and wrote + the check, typically printed in the upper left corner. 
+ x-aws-idp-evaluation-method: EXACT + payee_name: + type: string + description: >- + The name of the person or entity receiving the payment, written on the + 'Pay to the order of' line of the check. + x-aws-idp-evaluation-method: EXACT + bank_name: + type: string + description: >- + The name of the financial institution where the account is held, + usually printed prominently on the check. + x-aws-idp-evaluation-method: EXACT + memo: + type: string + description: >- + Optional note or reference information written in the memo field, + typically in the lower left area of the check. + x-aws-idp-evaluation-method: EXACT + routing_number_valid: + type: string + description: >- + A boolean indicator of whether the bank routing number on the check is + valid and properly formatted. + x-aws-idp-evaluation-method: EXACT + bank_routing_number: + type: string + description: >- + The bank's routing number for electronic transactions, typically found + in the MICR line at the bottom of the check. + x-aws-idp-evaluation-method: EXACT + amount_in_words: + type: string + description: >- + The payment amount written out in words, typically on the line below + the payee name and ending with 'DOLLARS'. + x-aws-idp-evaluation-method: EXACT + is_signed: + type: boolean + description: >- + A boolean indicator of whether the check has been signed by the + account holder in the signature area. + x-aws-idp-evaluation-method: EXACT + $id: Bank-checks + - $schema: https://json-schema.org/draft/2020-12/schema + $defs: + Address: + type: object + description: >- + The complete business address of the employing company, including + street address, city, state, and postal code information. + properties: + City: + type: string + description: The city portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + Line1: + type: string + description: >- + The primary street address line of the company's business + location. 
+ x-aws-idp-evaluation-method: EXACT + State: + type: string + description: The state or province portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + ZipCode: + type: string + description: The postal code portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + Line2: + type: string + description: >- + The secondary address line for the company, such as suite or floor + number. + x-aws-idp-evaluation-method: EXACT + account_summaryItem: + type: object + properties: + summary_desc: + type: string + description: >- + Description of the account summary item, such as opening balance + or closing balance. + x-aws-idp-evaluation-method: EXACT + summary_amount: + type: string + description: The monetary amount associated with this summary item. + x-aws-idp-evaluation-method: NUMERIC_EXACT + transaction_detail: + type: object + properties: + date: + type: string + description: The date when the transaction occurred. + x-aws-idp-evaluation-method: EXACT + description: + type: string + description: Description of the transaction or merchant information. + x-aws-idp-evaluation-method: EXACT + balance: + type: string + description: The account balance after this transaction. + x-aws-idp-evaluation-method: NUMERIC_EXACT + deposits: + type: string + description: Amount deposited or credited to the account. + x-aws-idp-evaluation-method: NUMERIC_EXACT + withdrawals: + type: string + description: Amount withdrawn or debited from the account. + x-aws-idp-evaluation-method: NUMERIC_EXACT description: >- - An annual tax document provided by employers to employees reporting wages earned and - taxes withheld during the tax year for federal and state income tax filing purposes, - containing comprehensive compensation and withholding information. 
- attributes: - - name: other - description: >- - Other compensation or benefits not covered in standard W2 boxes, representing - additional taxable or non-taxable benefits provided to the employee. - evaluation_method: EXACT - attributeType: simple - - name: nonqualified_plans_incom - description: >- - Income from nonqualified deferred compensation plans, representing distributions - or benefits from employer-sponsored retirement or compensation plans. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: employer_info - groupAttributes: - - name: employer_address + A periodic financial document issued by banks detailing account activity, + balances, and transactions over a specific time period, providing account + holders with a summary of their financial activity and current account + status. + type: object + x-aws-idp-document-type: Bank-Statement + properties: + account_holder_address: + description: >- + The mailing address of the account holder as recorded by the bank, + typically displayed prominently on the statement header. + $ref: "#/$defs/Address" + x-aws-idp-evaluation-method: EXACT + account_number: + type: string + description: >- + The unique identifier for the bank account, often partially masked for + security purposes on the statement. + x-aws-idp-evaluation-method: EXACT + account_type: + type: string + description: >- + The category of bank account such as checking, savings, money market, + etc., indicating the type of banking service. + x-aws-idp-evaluation-method: EXACT + account_summary: + description: >- + Summary of key account information including opening balance, closing + balance, and other important account totals for the statement period. 
+ type: array + x-aws-idp-list-item-description: Each item represents a key account balance or summary figure + items: + $ref: "#/$defs/account_summaryItem" + x-aws-idp-evaluation-method: LLM + statement_end_date: + format: date + description: >- + The ending date of the statement period in MM/DD/YYYY format, + indicating when the reporting period concluded. + type: string + x-aws-idp-evaluation-method: EXACT + statement_start_date: + format: date + description: >- + The beginning date of the statement period in MM/DD/YYYY format, + indicating when the reporting period began. + type: string + x-aws-idp-evaluation-method: EXACT + account_holder_name: + type: string + description: >- + The name of the person or entity who owns the bank account, as + registered with the financial institution. + x-aws-idp-evaluation-method: EXACT + branch_transit_number: + type: string + description: >- + The specific branch identifier or transit number associated with the + account, used for routing and identification purposes. + x-aws-idp-evaluation-method: EXACT + bank_name: + type: string + description: >- + The name of the financial institution issuing the statement, typically + displayed prominently at the top of the document. + x-aws-idp-evaluation-method: EXACT + transaction_details: + description: >- + Detailed listing of all transactions that occurred during the + statement period, including deposits, withdrawals, and resulting + account balances. + type: array + x-aws-idp-list-item-description: Each item represents an individual transaction record + items: + $ref: "#/$defs/transaction_detail" + x-aws-idp-evaluation-method: LLM + $id: Bank-Statement + - $schema: https://json-schema.org/draft/2020-12/schema + $defs: + employer_info: + type: object + description: >- + Complete information about the employing organization including name, + address, tax identification numbers, and control numbers for + processing. 
+ properties: + control_number: + type: string + description: >- + A unique identifier assigned by the employer for tracking and + processing purposes. + x-aws-idp-evaluation-method: EXACT + employer_address: + type: string description: The complete business address of the employing company. - evaluation_method: EXACT - - name: control_number - description: A unique identifier assigned by the employer for tracking and processing purposes. - evaluation_method: EXACT - - name: employer_name + x-aws-idp-evaluation-method: EXACT + employer_name: + type: string description: The legal name of the employing company or organization. - evaluation_method: EXACT - - name: ein + x-aws-idp-evaluation-method: EXACT + ein: + type: string description: The federal Employer Identification Number of the company. - evaluation_method: EXACT - - name: employer_zip_code + x-aws-idp-evaluation-method: EXACT + employer_zip_code: + type: string description: The postal code portion of the employer's address. - evaluation_method: EXACT - description: >- - Complete information about the employing organization including name, address, - tax identification numbers, and control numbers for processing. - evaluation_method: LLM - attributeType: group - - name: filing_info - groupAttributes: - - name: omb_number - description: The Office of Management and Budget form number for the W2 form. - evaluation_method: EXACT - - name: verification_code + x-aws-idp-evaluation-method: EXACT + filing_info: + type: object + description: >- + Official form identification and verification information including + OMB numbers and validation codes for the W2 document. + properties: + verification_code: + type: string description: A verification code used to validate the authenticity of the form. - evaluation_method: EXACT - description: >- - Official form identification and verification information including OMB numbers - and validation codes for the W2 document. 
- evaluation_method: LLM - attributeType: group - - name: federal_tax_info - groupAttributes: - - name: federal_income_tax + x-aws-idp-evaluation-method: EXACT + omb_number: + type: string + description: The Office of Management and Budget form number for the W2 form. + x-aws-idp-evaluation-method: EXACT + code: + type: object + properties: + amount: + type: string + description: The monetary amount associated with this compensation code. + x-aws-idp-evaluation-method: NUMERIC_EXACT + code: + type: string + description: The letter code representing the type of compensation or benefit. + x-aws-idp-evaluation-method: EXACT + federal_tax_info: + type: object + description: >- + Federal tax withholding information including income tax, Social + Security tax, Medicare tax, and allocated tips for federal tax + reporting. + properties: + federal_income_tax: + type: string description: The amount of federal income tax withheld from the employee's pay. - evaluation_method: NUMERIC_EXACT - - name: allocated_tips + x-aws-idp-evaluation-method: NUMERIC_EXACT + allocated_tips: + type: string description: Tips allocated by the employer to the employee for tax purposes. - evaluation_method: NUMERIC_EXACT - - name: social_security_tax - description: The amount of Social Security tax withheld from the employee's pay. - evaluation_method: NUMERIC_EXACT - - name: medicare_tax + x-aws-idp-evaluation-method: NUMERIC_EXACT + social_security_tax: + type: string + description: >- + The amount of Social Security tax withheld from the employee's + pay. + x-aws-idp-evaluation-method: NUMERIC_EXACT + medicare_tax: + type: string description: The amount of Medicare tax withheld from the employee's pay. - evaluation_method: NUMERIC_EXACT - description: >- - Federal tax withholding information including income tax, Social Security tax, - Medicare tax, and allocated tips for federal tax reporting. 
- evaluation_method: LLM - attributeType: group - - name: employee_general_info - groupAttributes: - - name: employee_name_suffix - description: Name suffix of the employee such as Jr., Sr., III, etc. - evaluation_method: EXACT - - name: employee_address - description: The complete residential address of the employee. - evaluation_method: EXACT - - name: employee_last_name + x-aws-idp-evaluation-method: NUMERIC_EXACT + employee_general_info: + type: object + description: >- + Complete personal information about the employee including full name, + address, and Social Security Number for tax identification purposes. + properties: + employee_last_name: + type: string description: The family name or surname of the employee. - evaluation_method: EXACT - - name: employee_zip_code + x-aws-idp-evaluation-method: EXACT + employee_name_suffix: + type: string + description: Name suffix of the employee such as Jr., Sr., III, etc. + x-aws-idp-evaluation-method: EXACT + employee_zip_code: + type: string description: The postal code portion of the employee's address. - evaluation_method: EXACT - - name: first_name + x-aws-idp-evaluation-method: EXACT + employee_address: + type: string + description: The complete residential address of the employee. + x-aws-idp-evaluation-method: EXACT + first_name: + type: string description: The given name of the employee. - evaluation_method: EXACT - - name: ssn + x-aws-idp-evaluation-method: EXACT + ssn: + type: string description: The Social Security Number of the employee. - evaluation_method: EXACT - description: >- - Complete personal information about the employee including full name, address, - and Social Security Number for tax identification purposes. 
- evaluation_method: LLM - attributeType: group - - name: federal_wage_info - groupAttributes: - - name: social_security_tips + x-aws-idp-evaluation-method: EXACT + federal_wage_info: + type: object + description: >- + Federal wage and compensation information including total wages, tips, + and amounts subject to Social Security and Medicare taxes. + properties: + social_security_tips: + type: string description: Tips subject to Social Security tax reporting. - evaluation_method: NUMERIC_EXACT - - name: wages_tips_other_compensation + x-aws-idp-evaluation-method: NUMERIC_EXACT + wages_tips_other_compensation: + type: string description: Total wages, tips, and other compensation paid to the employee. - evaluation_method: NUMERIC_EXACT - - name: medicare_wages_tips + x-aws-idp-evaluation-method: NUMERIC_EXACT + medicare_wages_tips: + type: string description: Wages and tips subject to Medicare tax. - evaluation_method: NUMERIC_EXACT - - name: social_security_wages + x-aws-idp-evaluation-method: NUMERIC_EXACT + social_security_wages: + type: string description: Wages subject to Social Security tax. - evaluation_method: NUMERIC_EXACT - description: >- - Federal wage and compensation information including total wages, tips, and - amounts subject to Social Security and Medicare taxes. - evaluation_method: LLM - attributeType: group - - name: codes - listItemTemplate: - itemAttributes: - - name: amount - description: The monetary amount associated with this compensation code. - evaluation_method: NUMERIC_EXACT - - name: code - description: The letter code representing the type of compensation or benefit. - evaluation_method: EXACT - itemDescription: Each item represents a specific type of compensation or benefit with its corresponding code - description: >- - Additional compensation codes and amounts representing various types of benefits, - deferred compensation, or other taxable/non-taxable items. 
- evaluation_method: LLM - attributeType: list - - name: state_taxes_table - listItemTemplate: - itemAttributes: - - name: state_name - description: The name of the state for tax reporting purposes. - evaluation_method: EXACT - - name: local_wages_tips - description: Wages and tips subject to local income tax. - evaluation_method: NUMERIC_EXACT - - name: employer_state_id_number - description: The employer's state identification number for this jurisdiction. - evaluation_method: NUMERIC_EXACT - - name: state_wages_and_tips - description: Wages and tips subject to state income tax. - evaluation_method: NUMERIC_EXACT - - name: state_income_tax - description: State income tax withheld for this jurisdiction. - evaluation_method: NUMERIC_EXACT - - name: local_income_tax - description: Local income tax withheld for this jurisdiction. - evaluation_method: NUMERIC_EXACT - - name: locality_name - description: The name of the local jurisdiction for local tax reporting. - evaluation_method: EXACT - itemDescription: Each item represents state and local tax information for a specific jurisdiction - description: >- - State and local tax information including wages subject to tax, taxes withheld, - and jurisdiction details for state and local tax reporting. - evaluation_method: LLM - attributeType: list - - - name: Homeowners-Insurance-Application + x-aws-idp-evaluation-method: NUMERIC_EXACT + state_taxes_tableItem: + type: object + properties: + state_name: + type: string + description: The name of the state for tax reporting purposes. + x-aws-idp-evaluation-method: EXACT + local_wages_tips: + type: string + description: Wages and tips subject to local income tax. + x-aws-idp-evaluation-method: NUMERIC_EXACT + employer_state_id_number: + type: string + description: The employer's state identification number for this jurisdiction. + x-aws-idp-evaluation-method: NUMERIC_EXACT + state_wages_and_tips: + type: string + description: Wages and tips subject to state income tax. 
+ x-aws-idp-evaluation-method: NUMERIC_EXACT + state_income_tax: + type: string + description: State income tax withheld for this jurisdiction. + x-aws-idp-evaluation-method: NUMERIC_EXACT + local_income_tax: + type: string + description: Local income tax withheld for this jurisdiction. + x-aws-idp-evaluation-method: NUMERIC_EXACT + locality_name: + type: string + description: The name of the local jurisdiction for local tax reporting. + x-aws-idp-evaluation-method: EXACT description: >- - An application form for homeowners insurance coverage containing applicant personal information, - property details, coverage requirements, existing insurance history, and underwriting data - necessary for evaluating risk and determining appropriate coverage terms. - attributes: - - name: Expiration Date - description: >- - The date when the insurance policy expires and requires renewal, indicating when - coverage will terminate if not renewed. - evaluation_method: EXACT - attributeType: simple - - name: Purchase Date and Time - description: >- - The specific date and time when the insurance policy was purchased, including both - date and time components for precise transaction recording. - evaluation_method: EXACT - attributeType: simple - - name: Policy Number - description: >- - The unique identifier assigned to the insurance policy for tracking and reference - purposes throughout the policy lifecycle. - evaluation_method: EXACT - attributeType: simple - - name: Named Insured(s) and Mailing Address - description: >- - The complete name and mailing address of the primary insured party, representing - the policyholder and their contact information. - evaluation_method: EXACT - attributeType: simple - - name: Insurance Company - description: >- - The name and address of the insurance provider issuing the policy, including - complete company contact information. 
- evaluation_method: EXACT - attributeType: simple - - name: Insured Property - description: >- - The complete address of the property being insured, representing the physical - location covered by the homeowners insurance policy. - evaluation_method: EXACT - attributeType: simple - - name: Primary Phone number - description: >- - The main contact phone number for the policyholder, used for communication - regarding the insurance policy and claims. - evaluation_method: EXACT - attributeType: simple - - name: Effective Date - description: >- - The date when the insurance coverage begins and becomes active, marking the - start of the policy period. - evaluation_method: EXACT - attributeType: simple - - name: Primary Email - description: >- - The main email address for the policyholder, used for electronic communication - regarding policy matters and updates. - evaluation_method: EXACT - attributeType: simple - - name: Alternate Phone number - description: >- - The secondary contact phone number for the policyholder, providing an alternative - method of communication for policy-related matters. - evaluation_method: EXACT - attributeType: simple - - name: Co-Applicant Information - groupAttributes: - - name: Drivers License Number + An annual tax document provided by employers to employees reporting wages + earned and taxes withheld during the tax year for federal and state + income tax filing purposes, containing comprehensive compensation and + withholding information. + type: object + x-aws-idp-document-type: W2 + properties: + employer_info: + description: >- + Complete information about the employing organization including name, + address, tax identification numbers, and control numbers for + processing. + $ref: "#/$defs/employer_info" + x-aws-idp-evaluation-method: LLM + filing_info: + description: >- + Official form identification and verification information including + OMB numbers and validation codes for the W2 document. 
+ $ref: "#/$defs/filing_info" + x-aws-idp-evaluation-method: LLM + codes: + description: >- + Additional compensation codes and amounts representing various types + of benefits, deferred compensation, or other taxable/non-taxable + items. + type: array + x-aws-idp-list-item-description: >- + Each item represents a specific type of compensation or benefit with + its corresponding code + items: + $ref: "#/$defs/code" + x-aws-idp-evaluation-method: LLM + other: + type: string + description: >- + Other compensation or benefits not covered in standard W2 boxes, + representing additional taxable or non-taxable benefits provided to + the employee. + x-aws-idp-evaluation-method: EXACT + federal_tax_info: + description: >- + Federal tax withholding information including income tax, Social + Security tax, Medicare tax, and allocated tips for federal tax + reporting. + $ref: "#/$defs/federal_tax_info" + x-aws-idp-evaluation-method: LLM + state_taxes_table: + description: >- + State and local tax information including wages subject to tax, taxes + withheld, and jurisdiction details for state and local tax reporting. + type: array + x-aws-idp-list-item-description: >- + Each item represents state and local tax information for a specific + jurisdiction + items: + $ref: "#/$defs/state_taxes_tableItem" + x-aws-idp-evaluation-method: LLM + employee_general_info: + description: >- + Complete personal information about the employee including full name, + address, and Social Security Number for tax identification purposes. + $ref: "#/$defs/employee_general_info" + x-aws-idp-evaluation-method: LLM + federal_wage_info: + description: >- + Federal wage and compensation information including total wages, tips, + and amounts subject to Social Security and Medicare taxes. 
+ $ref: "#/$defs/federal_wage_info" + x-aws-idp-evaluation-method: LLM + nonqualified_plans_incom: + type: string + description: >- + Income from nonqualified deferred compensation plans, representing + distributions or benefits from employer-sponsored retirement or + compensation plans. + x-aws-idp-evaluation-method: NUMERIC_EXACT + $id: W2 + - $schema: https://json-schema.org/draft/2020-12/schema + $defs: + Auto Claims, Accidents, and Violations: + type: object + description: >- + Comprehensive history of auto claims, accidents, and traffic + violations for underwriting risk assessment purposes. + properties: + Major: + type: string + description: Information about major auto claims or incidents. + x-aws-idp-evaluation-method: EXACT + Number of Comp Claims: + type: string + description: The number of comprehensive auto insurance claims filed. + x-aws-idp-evaluation-method: EXACT + Number of Violations: + type: string + description: The total number of traffic violations. + x-aws-idp-evaluation-method: EXACT + At-Fault: + type: string + description: Details about at-fault accidents or incidents. + x-aws-idp-evaluation-method: EXACT + Number of Auto Accidents: + type: string + description: The total number of auto accidents. + x-aws-idp-evaluation-method: EXACT + Minor: + type: string + description: Information about minor auto claims or incidents. + x-aws-idp-evaluation-method: EXACT + Not-at-Fault: + type: string + description: Details about not-at-fault accidents or incidents. + x-aws-idp-evaluation-method: EXACT + Co-Applicant Information: + type: object + description: >- + Complete information about the co-applicant including personal + details, driver's license information, insurance history, and + relationship to primary applicant. + properties: + Drivers License Number: + type: string description: The driver's license number of the co-applicant. 
- evaluation_method: EXACT - - name: Length of Time with Current Auto Carrier - description: Duration of relationship with current auto insurance provider for co-applicant. - evaluation_method: EXACT - - name: DL State + x-aws-idp-evaluation-method: EXACT + Length of Time with Current Auto Carrier: + type: string + description: >- + Duration of relationship with current auto insurance provider for + co-applicant. + x-aws-idp-evaluation-method: EXACT + DL State: + type: string description: The state that issued the co-applicant's driver's license. - evaluation_method: EXACT - - name: Education Level + x-aws-idp-evaluation-method: EXACT + Education Level: + type: string description: The highest level of education completed by the co-applicant. - evaluation_method: EXACT - - name: Currently Insured- Auto + x-aws-idp-evaluation-method: EXACT + Currently Insured- Auto: + type: string description: Current auto insurance carrier for the co-applicant. - evaluation_method: EXACT - - name: Length of Time with Prior Auto Carrier - description: Duration of relationship with previous auto insurance provider for co-applicant. - evaluation_method: EXACT - - name: Date of Birth + x-aws-idp-evaluation-method: EXACT + Length of Time with Prior Auto Carrier: + type: string + description: >- + Duration of relationship with previous auto insurance provider for + co-applicant. + x-aws-idp-evaluation-method: EXACT + Date of Birth: + type: string description: The birth date of the co-applicant. - evaluation_method: EXACT - - name: Gender + x-aws-idp-evaluation-method: EXACT + Gender: + type: string description: The gender of the co-applicant. - evaluation_method: EXACT - - name: Marital Status + x-aws-idp-evaluation-method: EXACT + Marital Status: + type: string description: The marital status of the co-applicant. 
- evaluation_method: EXACT - - name: Relationship to Primary Applicant + x-aws-idp-evaluation-method: EXACT + Relationship to Primary Applicant: + type: string description: The relationship of the co-applicant to the primary policyholder. - evaluation_method: EXACT - - name: Name + x-aws-idp-evaluation-method: EXACT + Name: + type: string description: The full name of the co-applicant. - evaluation_method: EXACT - description: >- - Complete information about the co-applicant including personal details, driver's - license information, insurance history, and relationship to primary applicant. - evaluation_method: LLM - attributeType: group - - name: Auto Claims, Accidents, and Violations - groupAttributes: - - name: Major - description: Information about major auto claims or incidents. - evaluation_method: EXACT - - name: Number of Comp Claims - description: The number of comprehensive auto insurance claims filed. - evaluation_method: EXACT - - name: Number of Violations - description: The total number of traffic violations. - evaluation_method: EXACT - - name: At-Fault - description: Details about at-fault accidents or incidents. - evaluation_method: EXACT - - name: Number of Auto Accidents - description: The total number of auto accidents. - evaluation_method: EXACT - - name: Minor - description: Information about minor auto claims or incidents. - evaluation_method: EXACT - - name: Not-at-Fault - description: Details about not-at-fault accidents or incidents. - evaluation_method: EXACT - description: >- - Comprehensive history of auto claims, accidents, and traffic violations for - underwriting risk assessment purposes. - evaluation_method: LLM - attributeType: group - - name: Primary Applicant Information - groupAttributes: - - name: Type of Current Property Policy - description: The type of current property insurance policy held by primary applicant. 
- evaluation_method: EXACT - - name: Drivers License Number + x-aws-idp-evaluation-method: EXACT + Primary Applicant Information: + type: object + description: >- + Complete information about the primary applicant including personal + details, driver's license information, insurance history, and + existing policy details. + properties: + Type of Current Property Policy: + type: string + description: >- + The type of current property insurance policy held by primary + applicant. + x-aws-idp-evaluation-method: EXACT + Drivers License Number: + type: string description: The driver's license number of the primary applicant. - evaluation_method: EXACT - - name: Education Level + x-aws-idp-evaluation-method: EXACT + Education Level: + type: string description: The highest level of education completed by the primary applicant. - evaluation_method: EXACT - - name: Currently Insured Auto + x-aws-idp-evaluation-method: EXACT + Currently Insured Auto: + type: string description: Current auto insurance carrier for the primary applicant. - evaluation_method: EXACT - - name: Length of Time with Prior Auto Carrier - description: Duration of relationship with previous auto insurance provider for primary applicant. - evaluation_method: EXACT - - name: Gender + x-aws-idp-evaluation-method: EXACT + Length of Time with Prior Auto Carrier: + type: string + description: >- + Duration of relationship with previous auto insurance provider for + primary applicant. + x-aws-idp-evaluation-method: EXACT + Gender: + type: string description: The gender of the primary applicant. - evaluation_method: EXACT - - name: Marital Status + x-aws-idp-evaluation-method: EXACT + Marital Status: + type: string description: The marital status of the primary applicant. - evaluation_method: EXACT - - name: Name + x-aws-idp-evaluation-method: EXACT + Name: + type: string description: The full name of the primary applicant. 
- evaluation_method: EXACT - - name: Length of Time with Current Auto Carrier - description: Duration of relationship with current auto insurance provider for primary applicant. - evaluation_method: EXACT - - name: Existing Esurance Policy - description: Existing insurance policy number or reference for primary applicant. - evaluation_method: EXACT - - name: DL State + x-aws-idp-evaluation-method: EXACT + Length of Time with Current Auto Carrier: + type: string + description: >- + Duration of relationship with current auto insurance provider for + primary applicant. + x-aws-idp-evaluation-method: EXACT + Existing Esurance Policy: + type: string + description: >- + Existing insurance policy number or reference for primary + applicant. + x-aws-idp-evaluation-method: EXACT + DL State: + type: string description: The state that issued the primary applicant's driver's license. - evaluation_method: EXACT - - name: Date of Birth + x-aws-idp-evaluation-method: EXACT + Date of Birth: + type: string description: The birth date of the primary applicant. - evaluation_method: EXACT - - name: Years with Prior Property Company - description: Number of years with previous property insurance company for primary applicant. - evaluation_method: EXACT - description: >- - Complete information about the primary applicant including personal details, - driver's license information, insurance history, and existing policy details. - evaluation_method: LLM - attributeType: group + x-aws-idp-evaluation-method: EXACT + Years with Prior Property Company: + type: string + description: >- + Number of years with previous property insurance company for + primary applicant. + x-aws-idp-evaluation-method: EXACT + description: >- + An application form for homeowners insurance coverage containing applicant + personal information, property details, coverage requirements, existing + insurance history, and underwriting data necessary for evaluating risk + and determining appropriate coverage terms. 
+ type: object + x-aws-idp-document-type: Homeowners-Insurance-Application + properties: + Expiration Date: + type: string + description: >- + The date when the insurance policy expires and requires renewal, + indicating when coverage will terminate if not renewed. + x-aws-idp-evaluation-method: EXACT + Purchase Date and Time: + type: string + description: >- + The specific date and time when the insurance policy was purchased, + including both date and time components for precise transaction + recording. + x-aws-idp-evaluation-method: EXACT + Policy Number: + type: string + description: >- + The unique identifier assigned to the insurance policy for tracking + and reference purposes throughout the policy lifecycle. + x-aws-idp-evaluation-method: EXACT + Alternate Phone number: + type: string + description: >- + The secondary contact phone number for the policyholder, providing an + alternative method of communication for policy-related matters. + x-aws-idp-evaluation-method: EXACT + Named Insured(s) and Mailing Address: + type: string + description: >- + The complete name and mailing address of the primary insured party, + representing the policyholder and their contact information. + x-aws-idp-evaluation-method: EXACT + Insurance Company: + type: string + description: >- + The name and address of the insurance provider issuing the policy, + including complete company contact information. + x-aws-idp-evaluation-method: EXACT + Co-Applicant Information: + description: >- + Complete information about the co-applicant including personal + details, driver's license information, insurance history, and + relationship to primary applicant. + $ref: "#/$defs/Co-Applicant Information" + x-aws-idp-evaluation-method: LLM + Insured Property: + type: string + description: >- + The complete address of the property being insured, representing the + physical location covered by the homeowners insurance policy. 
+ x-aws-idp-evaluation-method: EXACT + Primary Phone number: + type: string + description: >- + The main contact phone number for the policyholder, used for + communication regarding the insurance policy and claims. + x-aws-idp-evaluation-method: EXACT + Auto Claims, Accidents, and Violations: + description: >- + Comprehensive history of auto claims, accidents, and traffic + violations for underwriting risk assessment purposes. + $ref: "#/$defs/Auto Claims, Accidents, and Violations" + x-aws-idp-evaluation-method: LLM + Effective Date: + type: string + description: >- + The date when the insurance coverage begins and becomes active, + marking the start of the policy period. + x-aws-idp-evaluation-method: EXACT + Primary Email: + type: string + description: >- + The main email address for the policyholder, used for electronic + communication regarding policy matters and updates. + x-aws-idp-evaluation-method: EXACT + Primary Applicant Information: + description: >- + Complete information about the primary applicant including personal + details, driver's license information, insurance history, and + existing policy details. + $ref: "#/$defs/Primary Applicant Information" + x-aws-idp-evaluation-method: LLM + $id: Homeowners-Insurance-Application + classification: classificationMethod: multimodalPageLevelClassification maxPagesForClassification: "ALL" @@ -1460,15 +1733,15 @@ agents: 3. Collect relevant logs from CloudWatch 4. Identify any performance issues from X-Ray traces 5. 
Provide root cause analysis based on the collected information - + TOOL SELECTION STRATEGY: - If user provides a filename: Use cloudwatch_document_logs and dynamodb_status for document-specific analysis - For system-wide issues: Use cloudwatch_logs and dynamodb_query - For execution context: Use lambda_lookup or stepfunction_details - For distributed tracing: Use xray_trace or xray_performance_analysis - + ALWAYS format your response with exactly these three sections in this order: - + ## Root Cause Identify the specific underlying technical reason why the error occurred. Focus on the primary cause, not symptoms. @@ -1477,16 +1750,16 @@ agents:
Evidence - + Format evidence with source information. Include relevant data from tool responses: - + **For CloudWatch logs:** **Log Group:** [full log_group name] **Log Stream:** [full log_stream name] ``` [ERROR] timestamp message ``` - + **For other sources (DynamoDB, Step Functions, X-Ray):** **Source:** [service name and resource] ``` @@ -1508,14 +1781,14 @@ agents: - Use system-wide tools for pattern analysis - Combine DynamoDB status with CloudWatch logs for complete picture - Leverage X-Ray for distributed system issues - + ROOT CAUSE DETERMINATION: 1. Document Status: Check dynamodb_status first 2. Execution Details: Use stepfunction_details for workflow failures 3. Log Analysis: Use cloudwatch_document_logs or cloudwatch_logs for error details 4. Distributed Tracing: Use xray_performance_analysis for service interaction issues 5. Context: Use lambda_lookup for execution environment - + RECOMMENDATION GUIDELINES: For code-related issues or system bugs: - Do not suggest code modifications @@ -1534,7 +1807,7 @@ agents: - last week: 168 hours - last day: 24 hours - No time specified: 24 hours (default) - + IMPORTANT: Do not include any search quality reflections, search quality scores, or meta-analysis sections in your response. Only provide the three required sections: Root Cause, Recommendations, and Evidence. 
parameters: max_log_events: 5 @@ -1609,13 +1882,13 @@ pricing: - name: bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0 units: - name: inputTokens - price: '1.1E-06' + price: "1.1E-06" - name: outputTokens - price: '5.5E-06' + price: "5.5E-06" - name: cacheReadInputTokens - price: '1.1E-07' + price: "1.1E-07" - name: cacheWriteInputTokens - price: '1.4E-06' + price: "1.4E-06" - name: bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0 units: - name: inputTokens @@ -1700,89 +1973,89 @@ pricing: - name: bedrock/eu.amazon.nova-lite-v1:0 units: - name: inputTokens - price: '7.8E-8' + price: "7.8E-8" - name: outputTokens - price: '3.1E-7' + price: "3.1E-7" - name: cacheReadInputTokens - price: '1.9E-8' + price: "1.9E-8" - name: cacheWriteInputTokens - price: '7.8E-8' + price: "7.8E-8" - name: bedrock/eu.amazon.nova-pro-v1:0 units: - name: inputTokens - price: '1.0E-6' + price: "1.0E-6" - name: outputTokens - price: '4.2E-6' + price: "4.2E-6" - name: cacheReadInputTokens - price: '2.6E-7' + price: "2.6E-7" - name: cacheWriteInputTokens - price: '1.0E-6' + price: "1.0E-6" - name: bedrock/eu.anthropic.claude-3-haiku-20240307-v1:0 units: - name: inputTokens - price: '2.5E-7' + price: "2.5E-7" - name: outputTokens - price: '1.25E-6' + price: "1.25E-6" - name: bedrock/eu.anthropic.claude-haiku-4-5-20251001-v1:0 units: - name: inputTokens - price: '1.1E-6' + price: "1.1E-6" - name: outputTokens - price: '5.5E-6' + price: "5.5E-6" - name: cacheReadInputTokens - price: '1.1E-7' + price: "1.1E-7" - name: cacheWriteInputTokens - price: '1.4E-6' + price: "1.4E-6" - name: bedrock/eu.anthropic.claude-3-5-sonnet-20241022-v2:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/eu.anthropic.claude-3-7-sonnet-20250219-v1:0 units: - name: inputTokens - price: '3.0E-6' + 
price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/eu.anthropic.claude-sonnet-4-20250514-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/eu.anthropic.claude-sonnet-4-5-20250929-v1:0 units: - name: inputTokens - price: '3.3E-6' + price: "3.3E-6" - name: outputTokens - price: '1.65E-5' + price: "1.65E-5" - name: cacheReadInputTokens - price: '3.3E-7' + price: "3.3E-7" - name: cacheWriteInputTokens - price: '4.125E-6' + price: "4.125E-6" - name: bedrock/eu.anthropic.claude-sonnet-4-5-20250929-v1:0:1m units: - name: inputTokens - price: '6.6E-6' + price: "6.6E-6" - name: outputTokens - price: '2.475E-5' + price: "2.475E-5" - name: cacheReadInputTokens - price: '6.6E-7' + price: "6.6E-7" - name: cacheWriteInputTokens - price: '8.25E-6' + price: "8.25E-6" # AWS Lambda pricing (US East - N. Virginia) - name: lambda/requests units: diff --git a/config_library/pattern-2/rvl-cdip-package-sample-with-few-shot-examples/config.yaml b/config_library/pattern-2/rvl-cdip-package-sample-with-few-shot-examples/config.yaml index d39bb5e5c..67e5264dd 100644 --- a/config_library/pattern-2/rvl-cdip-package-sample-with-few-shot-examples/config.yaml +++ b/config_library/pattern-2/rvl-cdip-package-sample-with-few-shot-examples/config.yaml @@ -3,7 +3,7 @@ notes: Default settings for the rvl-cdip-sample-with-few-shot-examples config ocr: - backend: "textract" # Default to Textract for backward compatibility + backend: "textract" # Default to Textract for backward compatibility model_id: "us.anthropic.claude-3-7-sonnet-20250219-v1:0" system_prompt: "You are an expert OCR system. 
Extract all text from the provided image accurately, preserving layout where possible." task_prompt: "Extract all text from this document image. Preserve the layout, including paragraphs, tables, and formatting." @@ -12,62 +12,75 @@ ocr: - name: TABLES - name: SIGNATURES image: - dpi: '150' - target_width: '' - target_height: '' + dpi: "150" + target_width: "" + target_height: "" classes: - - name: letter - description: >- - A formal written correspondence with sender/recipient addresses, date, - salutation, body, and closing signature - attributes: - - name: sender_name + - $schema: https://json-schema.org/draft/2020-12/schema + $id: letter + x-aws-idp-document-type: letter + type: object + properties: + sender_name: + type: string description: >- The name of the person or entity who wrote or sent the letter. Look for text following or near terms like 'from', 'sender', 'authored by', 'written by', or at the end of the letter before a signature. - - name: sender_address + sender_address: + type: string description: >- The physical address of the sender, typically appearing at the top of the letter. May be labeled as 'address', 'location', or 'from address'. - - name: recipient_name + recipient_name: + type: string description: >- The name of the person or entity receiving the letter. Look for this after 'to', 'recipient', 'addressee', or at the beginning of the letter. - - name: recipient_address + recipient_address: + type: string description: >- The physical address where the letter is to be delivered. Often labeled as 'to address' or 'delivery address', typically appearing below the recipient name. - - name: date + date: + type: string description: >- The date when the letter was written. Look for a standalone date or text following phrases like 'written on' or 'dated'. - - name: subject + subject: + type: string description: >- The topic or main point of the letter. Often preceded by 'subject', 'RE:', or 'regarding'. 
- - name: letter_type + letter_type: + type: string description: >- The category or classification of the letter, such as 'complaint', 'inquiry', 'invitation', etc. May be indicated by 'type' or 'category'. - - name: signature + signature: + type: string description: >- The handwritten name or mark of the sender at the end of the letter. May follow terms like 'signed by' or simply appear at the bottom of the document. - - name: cc + cc: + type: string description: >- Names of people who receive a copy of the letter in addition to the main recipient. Often preceded by 'cc', 'carbon copy', or 'copy to'. - - name: reference_number + reference_number: + type: string description: >- An identifying number or code associated with the letter. Look for labels like 'ref', 'reference', or 'our ref'. - examples: + description: >- + A formal written correspondence with sender/recipient addresses, date, + salutation, body, and closing signature + x-aws-idp-examples: - classPrompt: This is an example of the class 'letter' name: Letter1 attributesPrompt: |- @@ -82,7 +95,8 @@ classes: "signature": "Will E. 
Clark", "cc": null, "reference_number": "TNJB 0008497" - imagePath: config_library/pattern-2/few_shot_example_with_multimodal_page_classification/example-images/letter1.jpg + imagePath: >- + config_library/pattern-2/few_shot_example_with_multimodal_page_classification/example-images/letter1.jpg - classPrompt: This is an example of the class 'letter' name: Letter2 attributesPrompt: |- @@ -97,367 +111,467 @@ classes: "signature": "Bill", "cc": null, "reference_number": null - imagePath: config_library/pattern-2/few_shot_example_with_multimodal_page_classification/example-images/letter2.png - - name: form - description: >- - A structured document with labeled fields, checkboxes, or blanks requiring - user input and completion - attributes: - - name: form_type + imagePath: >- + config_library/pattern-2/few_shot_example_with_multimodal_page_classification/example-images/letter2.png + - $schema: https://json-schema.org/draft/2020-12/schema + $id: form + x-aws-idp-document-type: form + type: object + properties: + form_type: + type: string description: >- The category or purpose of the form, such as 'application', 'registration', 'request', etc. May be identified by 'form name', 'document type', or 'form category'. - - name: form_id + form_id: + type: string description: >- The unique identifier for the form, typically a number or alphanumeric code. Often labeled as 'form number', 'id', or 'reference number'. - - name: submission_date + submission_date: + type: string description: >- The date when the form was submitted or filed. Look for text near 'date', 'submitted on', or 'filed on'. - - name: submitter_name + submitter_name: + type: string description: >- The name of the person who submitted the form. May be labeled as 'name', 'submitted by', or 'filed by'. - - name: submitter_id + submitter_id: + type: string description: >- An identification number for the person submitting the form, such as social security number, employee ID, etc. 
Often labeled as 'id number', 'identification', or 'reference'. - - name: approval_status + approval_status: + type: string description: >- The current state of approval for the form, such as 'approved', 'pending', 'rejected', etc. Look for terms like 'status', 'approved', or 'pending'. - - name: processed_by + processed_by: + type: string description: >- The name of the person or department that processed the form. May be indicated by 'processor', 'handled by', or 'approved by'. - - name: processing_date + processing_date: + type: string description: >- The date when the form was processed or completed. Look for labels like 'processed on' or 'completion date'. - - name: department + department: + type: string description: >- The organizational unit responsible for the form. Often abbreviated as 'dept' or may appear as 'department' or 'division'. - - name: comments + comments: + type: string description: >- Additional notes or remarks about the form. Look for sections labeled 'notes', 'remarks', or 'comments'. - - name: invoice description: >- - A billing document listing items/services, quantities, prices, payment - terms, and transaction totals - attributes: - - name: invoice_number + A structured document with labeled fields, checkboxes, or blanks requiring + user input and completion + - $schema: https://json-schema.org/draft/2020-12/schema + $id: invoice + x-aws-idp-document-type: invoice + type: object + properties: + invoice_number: + type: string description: The unique identifier for the invoice. Look for 'invoice no', 'invoice - - name: invoice_date + invoice_date: + type: string description: >- The date when the invoice was issued. May be labeled as 'date', 'invoice date', or 'billing date'. - - name: due_date + due_date: + type: string description: >- The deadline by which payment must be made. Look for 'due date', 'payment due', or 'payable by'. 
- - name: vendor_name + vendor_name: + type: string description: >- The name of the business providing goods or services. May be labeled as 'vendor', 'seller', 'supplier', or simply appear prominently at the top of the invoice. - - name: vendor_address + vendor_address: + type: string description: >- The physical location of the vendor. Look for 'address', 'location', or 'business address', typically near the vendor name. - - name: customer_name + customer_name: + type: string description: >- The name of the person or entity being billed. Often preceded by 'customer', 'buyer', or 'bill to'. - - name: customer_address + customer_address: + type: string description: >- The address where the invoice is sent or goods are delivered. May be labeled as 'billing address' or 'ship to'. - - name: items + items: + type: string description: >- Descriptions of the products or services provided. Look for a section with 'description', 'item details', or 'products', usually in a table format. - - name: quantities + quantities: + type: string description: >- The number of each item provided. Often abbreviated as 'qty' or may appear as 'quantity' or 'amount' in a table column. - - name: unit_prices + unit_prices: + type: string description: >- The cost per unit of each item. May be labeled as 'price', 'rate', or 'unit cost'. - - name: subtotal + subtotal: + type: string description: >- The sum of all items before tax and other charges. Look for 'subtotal' or 'net amount', typically found toward the bottom of the invoice. - - name: tax + tax: + type: string description: >- The amount of tax charged on the invoice. May be labeled as 'tax', 'VAT', or 'GST', usually appearing after the subtotal. - - name: total_amount + total_amount: + type: string description: >- The final amount to be paid including all charges. Look for 'total', 'grand total', or 'amount due', typically the last figure on the invoice. 
- - name: payment_terms + payment_terms: + type: string description: >- The conditions under which payment should be made, such as '30 days', 'COD', etc. Often labeled as 'terms', 'payment terms', or 'conditions'. - - name: po_number + po_number: + type: string description: >- The purchase order reference number. May be abbreviated as 'PO' or appear as 'purchase order' or 'order reference'. - - name: resume description: >- - A professional summary showcasing work experience, education, skills, and - achievements for job applications - attributes: - - name: full_name + A billing document listing items/services, quantities, prices, payment + terms, and transaction totals + - $schema: https://json-schema.org/draft/2020-12/schema + $id: resume + x-aws-idp-document-type: resume + type: object + properties: + full_name: + type: string description: >- The complete name of the job applicant, typically appearing prominently at the top of the resume. May be simply labeled as 'name' or 'applicant name'. - - name: contact_info + contact_info: + type: string description: >- The phone number, email, and address of the applicant. Look for a section with 'contact', 'phone', 'email', or 'address', usually near the top of the resume. - - name: objective + objective: + type: string description: >- A statement outlining the applicant's career goals. May be labeled as 'objective', 'summary', or 'profile', typically appearing early in the resume. - - name: education + education: + type: string description: >- The academic history and qualifications of the applicant. Look for a section with 'education', 'academic background', or 'qualifications'. - - name: experience + experience: + type: string description: >- The work history and previous roles of the applicant. Often labeled as 'experience', 'work history', or 'employment'. - - name: skills + skills: + type: string description: >- The abilities and competencies of the applicant. 
Look for a section titled 'skills', 'competencies', or 'expertise'. - - name: certifications + certifications: + type: string description: >- Professional credentials and qualifications. May be labeled as 'certifications', 'certificates', or 'credentials'. - - name: languages + languages: + type: string description: >- Languages known and level of proficiency. Often appears in a section labeled 'languages' or 'language proficiency'. - - name: references + references: + type: string description: >- People who can vouch for the applicant's abilities. Look for 'references' or 'referees', typically at the end of the resume. - - name: achievements + achievements: + type: string description: >- Notable accomplishments and recognition. May be labeled as 'achievements', 'accomplishments', or 'awards'. - - name: scientific_publication description: >- - A peer-reviewed academic document with abstract, methodology, results, - citations, and research findings - attributes: - - name: title + A professional summary showcasing work experience, education, skills, and + achievements for job applications + - $schema: https://json-schema.org/draft/2020-12/schema + $id: scientific_publication + x-aws-idp-document-type: scientific_publication + type: object + properties: + title: + type: string description: >- The name of the scientific paper, typically appearing prominently at the beginning. May be labeled as 'title', 'paper title', or 'article title'. - - name: authors + authors: + type: string description: >- The researchers who conducted the study and wrote the paper. Look for names after 'authors', 'contributors', or 'researchers', usually following the title. - - name: abstract + abstract: + type: string description: >- A brief summary of the paper's content. Often labeled as 'abstract' or 'summary', appearing before the main text. - - name: keywords + keywords: + type: string description: >- Terms that represent the core topics of the paper. 
Look for a list labeled 'keywords' or 'key terms', typically after the abstract. - - name: publication_date + publication_date: + type: string description: >- The date when the paper was published. May be preceded by 'published' or labeled as 'publication date'. - - name: journal_name + journal_name: + type: string description: >- The name of the journal where the paper was published. Look for text following 'journal' or 'publication'. - - name: volume + volume: + type: string description: >- The volume number of the journal. Often abbreviated as 'vol' or may appear as 'volume'. - - name: issue + issue: + type: string description: >- The issue number of the journal. May be labeled as 'issue' or abbreviated as 'no'. - - name: pages + pages: + type: string description: >- The page range of the paper in the journal. Often abbreviated as 'pp' or may appear as 'pages'. - - name: doi + doi: + type: string description: >- The Digital Object Identifier for the paper, a unique alphanumeric string. Look for 'DOI' or 'digital object identifier'. - - name: funding + funding: + type: string description: >- Financial support received for the research. May be indicated by 'funding', 'grants', or 'financial support'. - - name: corresponding_author + corresponding_author: + type: string description: >- The author responsible for communication regarding the paper. Look for 'corresponding author' or 'contact author'. - - name: institutions + institutions: + type: string description: >- The organizations with which the authors are affiliated. May be labeled as 'affiliations' or 'institutions'. 
- - name: memo description: >- - An internal business communication with TO/FROM/DATE/SUBJECT headers for - organizational announcements or directives - attributes: - - name: memo_date + A peer-reviewed academic document with abstract, methodology, results, + citations, and research findings + - $schema: https://json-schema.org/draft/2020-12/schema + $id: memo + x-aws-idp-document-type: memo + type: object + properties: + memo_date: + type: string description: >- The date when the memo was written. Look for 'date' or 'memo date', typically near the top of the document. - - name: from + from: + type: string description: >- The person or department that wrote the memo. May be labeled as 'from', 'sender', or 'author'. - - name: to + to: + type: string description: >- The intended recipient of the memo. Look for text after 'to', 'recipient', or 'addressee'. - - name: subject + subject: + type: string description: >- The topic of the memo. Often preceded by 'subject', 'RE:', or 'regarding'. - - name: memo_type + memo_type: + type: string description: >- The category or classification of the memo, such as 'informational', 'directive', etc. May be indicated by 'type' or 'category'. - - name: priority + priority: + type: string description: >- The urgency level of the memo, such as 'urgent', 'high', 'normal', etc. Look for 'priority' or 'urgency'. - - name: distribution_list + distribution_list: + type: string description: >- Additional people who receive copies of the memo. May be labeled as 'distribution', 'cc', or 'copy'. - - name: reference_number + reference_number: + type: string description: >- An identifying number or code for the memo. Look for 'reference' or 'ref no'. - - name: department + department: + type: string description: >- The organizational unit issuing the memo. Often abbreviated as 'dept' or may appear as 'department' or 'division'. 
- - name: action_required + action_required: + type: string description: >- Steps that should be taken in response to the memo. Look for 'action', 'response needed', or 'next steps'. - - name: advertisement description: >- - A promotional material featuring product images, marketing text, - calls-to-action, and branding elements - attributes: - - name: product_name + An internal business communication with TO/FROM/DATE/SUBJECT headers for + organizational announcements or directives + - $schema: https://json-schema.org/draft/2020-12/schema + $id: advertisement + x-aws-idp-document-type: advertisement + type: object + properties: + product_name: + type: string description: >- The name of the item or service being advertised. Look for prominently displayed text that could be a 'product', 'item', or 'service' name. - - name: brand + brand: + type: string description: >- The company or manufacturer of the product. May be indicated by a logo or text labeled as 'brand', 'company', or 'manufacturer'. - - name: price + price: + type: string description: >- The cost of the product or service. Look for currency symbols or numbers followed by terms like 'price', 'cost', or 'special offer'. - - name: promotion_details + promotion_details: + type: string description: >- Information about special deals or discounts. May be introduced with 'promotion', 'offer', or 'deal'. - - name: validity_period + validity_period: + type: string description: >- The timeframe during which the offer is valid. Look for phrases like 'valid until', 'offer ends', or 'expires'. - - name: contact_info + contact_info: + type: string description: >- How to reach the advertiser. May include phone numbers, websites, or addresses following 'contact', 'call', or 'visit'. - - name: features + features: + type: string description: >- Notable qualities or benefits of the product. Often listed under 'features', 'benefits', or 'highlights'. 
- - name: terms_conditions + terms_conditions: + type: string description: >- Legal constraints or limitations of the offer. Look for fine print labeled as 'terms', 'conditions', or 'restrictions'. - - name: call_to_action + call_to_action: + type: string description: >- What the advertisement encourages the reader to do. Often appears as imperative phrases like 'call now', 'visit today', or 'order now'. - - name: disclaimer + disclaimer: + type: string description: >- Legal statements limiting liability or making clarifications. Usually appears as fine print introduced by 'disclaimer' or phrases like 'terms apply' or 'conditions apply'. - - name: email description: >- - A digital message with email headers (To/From/Subject), timestamps, and - conversational threading - attributes: - - name: from_address + A promotional material featuring product images, marketing text, + calls-to-action, and branding elements + - $schema: https://json-schema.org/draft/2020-12/schema + $id: email + x-aws-idp-document-type: email + type: object + properties: + from_address: + type: string description: >- The email address of the sender. Look for text following 'from', 'sender', or 'sent by', typically at the beginning of the email header. - - name: to_address + to_address: + type: string description: >- The email address of the primary recipient. May be labeled as 'to', 'recipient', or 'sent to'. - - name: cc_address + cc_address: + type: string description: >- Email addresses of additional recipients who receive copies. Look for 'cc' or 'carbon copy' followed by one or more email addresses. - - name: bcc_address + bcc_address: + type: string description: >- Email addresses of hidden recipients. May be labeled as 'bcc' or 'blind copy'. - - name: subject + subject: + type: string description: >- The topic of the email. Often preceded by 'subject', 'RE:', or 'regarding'. - - name: date_sent + date_sent: + type: string description: >- The date and time when the email was sent. 
Look for 'date', 'sent on', or 'received', typically in the email header. - - name: attachments + attachments: + type: string description: >- Files included with the email. May be indicated by 'attached', 'attachment', or 'enclosed', often with icons or file names. - - name: priority + priority: + type: string description: >- The urgency level of the email, such as 'high', 'normal', etc. Look for 'priority' or 'importance'. - - name: thread_id + thread_id: + type: string description: >- An identifier for the email conversation. May be labeled as 'thread' or 'conversation', typically not visible to regular users. - - name: message_id + message_id: + type: string description: >- A unique identifier for the specific email. Look for 'message id' or 'email id', usually hidden in the email metadata. - examples: + description: >- + A digital message with email headers (To/From/Subject), timestamps, and + conversational threading + x-aws-idp-examples: - classPrompt: This is an example of the class 'email' name: Email1 attributesPrompt: |- @@ -472,191 +586,238 @@ classes: "priority": null, "thread_id": null, "message_id": null - imagePath: config_library/pattern-2/few_shot_example_with_multimodal_page_classification/example-images/email1.jpg - - name: questionnaire - description: >- - A survey instrument containing numbered questions with multiple choice, - rating scales, or open-ended responses - attributes: - - name: form_title + imagePath: >- + config_library/pattern-2/few_shot_example_with_multimodal_page_classification/example-images/email1.jpg + - $schema: https://json-schema.org/draft/2020-12/schema + $id: questionnaire + x-aws-idp-document-type: questionnaire + type: object + properties: + form_title: + type: string description: >- The name or title of the questionnaire. Look for prominently displayed text at the beginning that could be a 'title', 'survey name', or 'questionnaire name'. 
- - name: respondent_info + respondent_info: + type: string description: >- Information about the person completing the questionnaire. May include fields labeled 'respondent', 'participant', or 'name'. - - name: submission_date + submission_date: + type: string description: >- The date when the questionnaire was completed. Look for 'date', 'completed on', or 'submitted'. - - name: section_headers + section_headers: + type: string description: >- Titles for different segments of the questionnaire. Often appear as bold or larger text introducing a new 'section', 'part', or 'segment'. - - name: question_types + question_types: + type: string description: >- The format of questions (multiple choice, free text, etc.). May be indicated by 'type', 'question format', or 'response format'. - - name: response_options + response_options: + type: string description: >- Possible answers for multiple-choice questions. Look for checkboxes, radio buttons, or dropdown menus with 'options', 'choices', or 'answers'. - - name: required_fields + required_fields: + type: string description: >- Questions that must be answered to complete the questionnaire. Often marked with an asterisk (*) or explicitly labeled as 'required', 'mandatory', or 'must answer'. - - name: instructions + instructions: + type: string description: >- Guidance on how to complete the questionnaire. Look for text introduced by 'instructions', 'directions', or 'guidelines'. - - name: survey_id + survey_id: + type: string description: >- A unique identifier for the questionnaire. May be labeled as 'survey id', 'reference number', or 'form id'. - - name: completion_status + completion_status: + type: string description: >- Whether the questionnaire has been fully completed. Look for indicators of 'status', 'completion', or 'progress', often shown as a percentage or progress bar. 
- - name: specification description: >- - A technical document detailing precise requirements, measurements, - standards, and implementation criteria - attributes: - - name: product_name + A survey instrument containing numbered questions with multiple choice, + rating scales, or open-ended responses + - $schema: https://json-schema.org/draft/2020-12/schema + $id: specification + x-aws-idp-document-type: specification + type: object + properties: + product_name: + type: string description: >- The name of the item being specified. Look for text labeled as 'product', 'item', or 'model', typically appearing prominently at the beginning. - - name: version + version: + type: string description: >- The iteration or release number. May be indicated by 'version', 'revision', or 'release', often followed by a number or code. - - name: technical_details + technical_details: + type: string description: >- Specific characteristics and capabilities. Look for sections labeled 'specifications', 'tech specs', or 'details', often presented in a detailed list. - - name: requirements + requirements: + type: string description: >- Necessary conditions or resources. May be introduced with 'requirements', 'prerequisites', or 'needed'. - - name: compatibility + compatibility: + type: string description: >- What the product can work with. Look for text following 'compatible with', 'works with', or 'supports'. - - name: dimensions + dimensions: + type: string description: >- Physical measurements of the product. Often labeled as 'dimensions', 'size', or 'measurements', usually including length, width, height, etc. - - name: materials + materials: + type: string description: >- What the product is made from. May be indicated by 'materials', 'composition', or phrases like 'made from'. - - name: standards + standards: + type: string description: >- Industry guidelines or certifications met. Look for references to 'standards', 'certifications', or 'compliance'. 
- - name: revision_history + revision_history: + type: string description: >- Record of changes to the specification. Often labeled as 'revisions', 'changes', or 'updates', typically in a table format. - - name: approval_info + approval_info: + type: string description: >- Details about who has validated the specification. May be indicated by phrases like 'approved by', 'certified by', or 'validated'. - - name: generic description: >- - An unstructured document lacking distinctive formatting or - purpose-specific elements of other categories - attributes: - - name: document_type + A technical document detailing precise requirements, measurements, + standards, and implementation criteria + - $schema: https://json-schema.org/draft/2020-12/schema + $id: generic + x-aws-idp-document-type: generic + type: object + properties: + document_type: + type: string description: >- The classification or category of the document. Look for terms like 'type', 'category', or 'class' that indicate what kind of document this is. - - name: document_date + document_date: + type: string description: >- The date when the document was created. May be labeled as 'date', 'created on', or 'issued on'. - - name: document_id + document_id: + type: string description: >- A unique identifier for the document. Look for 'id', 'reference', or 'number', typically appearing near the top of the document. - - name: title + title: + type: string description: >- The name or heading of the document. Often appears prominently at the beginning, may be labeled as 'title', 'heading', or 'subject'. - - name: author + author: + type: string description: >- The person who created the document. Look for 'author', 'creator', or 'sender'. - - name: recipient + recipient: + type: string description: >- The person for whom the document is intended. May be indicated by 'recipient', 'to', or 'addressee'. 
- - name: content_summary + content_summary: + type: string description: >- A brief description of the document's contents. Look for 'summary', 'abstract', or 'overview', typically appearing early in the document. - - name: status + status: + type: string description: >- The current state of the document, such as 'draft', 'final', 'pending', etc. May be labeled as 'status', 'state', or 'condition'. - - name: department + department: + type: string description: >- The organizational unit associated with the document. Often abbreviated as 'dept' or may appear as 'department' or 'division'. - - name: comments + comments: + type: string description: >- Additional notes or remarks about the document. Look for sections labeled 'notes', 'remarks', or 'comments'. - - name: bank-statement description: >- - A bank statement document containing account information, transactions, - and financial details - attributes: - - name: account_holder_name - description: >- - The name of the account holder. - - name: account_name - description: >- - The name or type of the bank account. - - name: account_number + An unstructured document lacking distinctive formatting or + purpose-specific elements of other categories + - $schema: https://json-schema.org/draft/2020-12/schema + $id: bank-statement + x-aws-idp-document-type: bank-statement + type: object + properties: + account_holder_name: + type: string + description: The name of the account holder. + account_name: + type: string + description: The name or type of the bank account. + account_number: + type: string description: >- The unique identifier for the bank account. Look for text following 'account number', 'account id', or 'account identifier'. - - name: transactions + transactions: + type: string description: >- The list of transactions on the account. Look for text following 'transactions', 'transaction history', or 'transaction details'. 
- examples: + description: >- + A bank statement document containing account information, transactions, + and financial details + x-aws-idp-examples: - classPrompt: Here are example images for each page of a 3 page 'bank-statement ' name: BankStatement1 - attributesPrompt: |- - Here are example images for each page of a 3 page 'bank-statement. Expected attributes are: + attributesPrompt: >- + Here are example images for each page of a 3 page 'bank-statement. + Expected attributes are: "account_holder_name": "Alejandro Rosalez", "account_name": ["Checking", "Savings"], "account_number": ["003525801543","352580154336"], "transactions": [{"Date": "2/6/2020", "Description": "Food Purchase - AnyCompany Restaurant - 1194989245", "Amount": "-171"}] - imagePath: config_library/pattern-2/few_shot_example_with_multimodal_page_classification/example-images/bank-statement-pages/ + imagePath: >- + config_library/pattern-2/few_shot_example_with_multimodal_page_classification/example-images/bank-statement-pages/ classification: maxPagesForClassification: "ALL" image: - target_height: '' - target_width: '' + target_height: "" + target_width: "" classificationMethod: multimodalPageLevelClassification model: us.amazon.nova-pro-v1:0 - temperature: '0.0' - top_p: '0.1' - top_k: '5' - max_tokens: '4096' + temperature: "0.0" + top_p: "0.1" + top_k: "5" + max_tokens: "4096" system_prompt: >- You are a multimodal document classification expert that analyzes business documents using both visual layout and textual content. Your task is to classify single-page documents into predefined categories based on their structural patterns, visual features, and text content. Your output must be valid JSON according to the requested format. 
@@ -703,13 +864,13 @@ classification: extraction: image: - target_height: '' - target_width: '' + target_height: "" + target_width: "" model: us.amazon.nova-pro-v1:0 - temperature: '0.0' - top_p: '0.1' - top_k: '5' - max_tokens: '4096' + temperature: "0.0" + top_p: "0.1" + top_k: "5" + max_tokens: "4096" system_prompt: > You are a document assistant. Respond only with JSON. Never make up data, only provide data found in the document being provided. @@ -778,7 +939,7 @@ extraction: {DOCUMENT_TEXT} - + @@ -803,22 +964,22 @@ assessment: enabled: true validation_enabled: false image: - target_height: '' - target_width: '' + target_height: "" + target_width: "" granular: enabled: true max_workers: "20" simple_batch_size: "3" list_batch_size: "1" - default_confidence_threshold: '0.8' - top_p: '0.1' - max_tokens: '10000' - top_k: '5' - temperature: '0.0' + default_confidence_threshold: "0.8" + top_p: "0.1" + max_tokens: "10000" + top_k: "5" + temperature: "0.0" model: us.amazon.nova-lite-v1:0 system_prompt: >- You are a document analysis assessment expert. Your role is to evaluate the confidence and accuracy of data extraction results by analyzing them against source documents. - + Provide accurate confidence scores for each assessment. When bounding boxes are requested, provide precise coordinate locations where information appears in the document. task_prompt: >- @@ -860,7 +1021,7 @@ assessment: For each field, provide bounding box coordinates: - bbox: [x1, y1, x2, y2] coordinates in normalized 0-1000 scale - page: Page number where the field appears (starting from 1) - + Coordinate system: - Use normalized scale 0-1000 for both x and y axes - x1, y1 = top-left corner of bounding box @@ -959,9 +1120,9 @@ assessment: evaluation: enabled: true llm_method: - top_p: '0.1' - max_tokens: '4096' - top_k: '5' + top_p: "0.1" + max_tokens: "4096" + top_k: "5" task_prompt: > I need to evaluate attribute extraction for a document of class: {DOCUMENT_CLASS}. 
@@ -995,7 +1156,7 @@ evaluation: "score": 0.0 to 1.0, "reason": "Your explanation here" } - temperature: '0.0' + temperature: "0.0" model: us.anthropic.claude-3-haiku-20240307-v1:0 system_prompt: > You are an evaluator that helps determine if the predicted and expected @@ -1003,9 +1164,9 @@ evaluation: context and meaning rather than just exact string matching. summarization: enabled: true - top_p: '0.1' - max_tokens: '4096' - top_k: '5' + top_p: "0.1" + max_tokens: "4096" + top_k: "5" task_prompt: > Analyze the provided document and create a comprehensive summary. @@ -1065,7 +1226,7 @@ summarization: {DOCUMENT_TEXT} - temperature: '0.0' + temperature: "0.0" model: us.anthropic.claude-3-7-sonnet-20250219-v1:0 system_prompt: > You are a document summarization expert who can analyze and summarize @@ -1125,8 +1286,8 @@ discovery: ] } with_ground_truth: - top_p: '0.1' - temperature: '1.0' + top_p: "0.1" + temperature: "1.0" user_prompt: >- This image contains unstructured data. Analyze the data line by line using the provided ground truth as reference. @@ -1144,17 +1305,17 @@ discovery: Do not extract the values. Format the extracted data using the below JSON format: Format the extracted groups and fields using the below JSON format: - + model_id: us.amazon.nova-pro-v1:0 system_prompt: >- You are an expert in processing forms. Extracting data from images and documents. Use provided ground truth data as reference to optimize field extraction and ensure consistency with expected document structure and field definitions. - max_tokens: '10000' + max_tokens: "10000" without_ground_truth: - top_p: '0.1' - temperature: '1.0' + top_p: "0.1" + temperature: "1.0" user_prompt: >- This image contains forms data. Analyze the form line by line. Image may contains multiple pages, process all the pages. @@ -1180,7 +1341,7 @@ discovery: documents. Analyze forms line by line to identify field names, data types, and organizational structure. 
Focus on creating comprehensive blueprints for document processing without extracting actual values. - max_tokens: '10000' + max_tokens: "10000" agents: error_analyzer: model_id: us.anthropic.claude-sonnet-4-20250514-v1:0 @@ -1194,15 +1355,15 @@ agents: 3. Collect relevant logs from CloudWatch 4. Identify any performance issues from X-Ray traces 5. Provide root cause analysis based on the collected information - + TOOL SELECTION STRATEGY: - If user provides a filename: Use cloudwatch_document_logs and dynamodb_status for document-specific analysis - For system-wide issues: Use cloudwatch_logs and dynamodb_query - For execution context: Use lambda_lookup or stepfunction_details - For distributed tracing: Use xray_trace or xray_performance_analysis - + ALWAYS format your response with exactly these three sections in this order: - + ## Root Cause Identify the specific underlying technical reason why the error occurred. Focus on the primary cause, not symptoms. @@ -1211,16 +1372,16 @@ agents:
Evidence - + Format evidence with source information. Include relevant data from tool responses: - + **For CloudWatch logs:** **Log Group:** [full log_group name] **Log Stream:** [full log_stream name] ``` [ERROR] timestamp message ``` - + **For other sources (DynamoDB, Step Functions, X-Ray):** **Source:** [service name and resource] ``` @@ -1242,14 +1403,14 @@ agents: - Use system-wide tools for pattern analysis - Combine DynamoDB status with CloudWatch logs for complete picture - Leverage X-Ray for distributed system issues - + ROOT CAUSE DETERMINATION: 1. Document Status: Check dynamodb_status first 2. Execution Details: Use stepfunction_details for workflow failures 3. Log Analysis: Use cloudwatch_document_logs or cloudwatch_logs for error details 4. Distributed Tracing: Use xray_performance_analysis for service interaction issues 5. Context: Use lambda_lookup for execution environment - + RECOMMENDATION GUIDELINES: For code-related issues or system bugs: - Do not suggest code modifications @@ -1268,7 +1429,7 @@ agents: - last week: 168 hours - last day: 24 hours - No time specified: 24 hours (default) - + IMPORTANT: Do not include any search quality reflections, search quality scores, or meta-analysis sections in your response. Only provide the three required sections: Root Cause, Recommendations, and Evidence. 
parameters: max_log_events: 5 @@ -1277,252 +1438,252 @@ pricing: - name: textract/detect_document_text units: - name: pages - price: '0.0015' + price: "0.0015" - name: textract/analyze_document-Layout units: - name: pages - price: '0.004' + price: "0.004" - name: textract/analyze_document-Signatures units: - name: pages - price: '0.0035' + price: "0.0035" - name: textract/analyze_document-Forms units: - name: pages - price: '0.05' + price: "0.05" - name: textract/analyze_document-Tables units: - name: pages - price: '0.015' + price: "0.015" - name: textract/analyze_document-Tables+Forms units: - name: pages - price: '0.065' + price: "0.065" - name: bedrock/us.amazon.nova-lite-v1:0 units: - name: inputTokens - price: '6.0E-8' + price: "6.0E-8" - name: outputTokens - price: '2.4E-7' + price: "2.4E-7" - name: cacheReadInputTokens - price: '1.5E-8' + price: "1.5E-8" - name: cacheWriteInputTokens - price: '6.0E-8' + price: "6.0E-8" - name: bedrock/us.amazon.nova-pro-v1:0 units: - name: inputTokens - price: '8.0E-7' + price: "8.0E-7" - name: outputTokens - price: '3.2E-6' + price: "3.2E-6" - name: cacheReadInputTokens - price: '2.0E-7' + price: "2.0E-7" - name: cacheWriteInputTokens - price: '8.0E-7' + price: "8.0E-7" - name: bedrock/us.amazon.nova-premier-v1:0 units: - name: inputTokens - price: '2.5E-6' + price: "2.5E-6" - name: outputTokens - price: '1.25E-5' + price: "1.25E-5" - name: bedrock/us.anthropic.claude-3-haiku-20240307-v1:0 units: - name: inputTokens - price: '2.5E-7' + price: "2.5E-7" - name: outputTokens - price: '1.25E-6' + price: "1.25E-6" - name: bedrock/us.anthropic.claude-3-5-haiku-20241022-v1:0 units: - name: inputTokens - price: '8.0E-7' + price: "8.0E-7" - name: outputTokens - price: '4.0E-6' + price: "4.0E-6" - name: cacheReadInputTokens - price: '8.0E-8' + price: "8.0E-8" - name: cacheWriteInputTokens - price: '1.0E-6' + price: "1.0E-6" - name: bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0 units: - name: inputTokens - price: '1.1E-06' + 
price: "1.1E-06" - name: outputTokens - price: '5.5E-06' + price: "5.5E-06" - name: cacheReadInputTokens - price: '1.1E-07' + price: "1.1E-07" - name: cacheWriteInputTokens - price: '1.4E-06' + price: "1.4E-06" - name: bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0:1m units: - name: inputTokens - price: '6.0E-6' + price: "6.0E-6" - name: outputTokens - price: '2.25E-5' + price: "2.25E-5" - name: cacheReadInputTokens - price: '6.0E-7' + price: "6.0E-7" - name: cacheWriteInputTokens - price: '7.5E-6' + price: "7.5E-6" - name: bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0 units: - name: inputTokens - price: '3.3E-6' + price: "3.3E-6" - name: outputTokens - price: '1.65E-5' + price: "1.65E-5" - name: cacheReadInputTokens - price: '3.3E-7' + price: "3.3E-7" - name: cacheWriteInputTokens - price: '4.125E-6' + price: "4.125E-6" - name: bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0:1m units: - name: inputTokens - price: '6.6E-6' + price: "6.6E-6" - name: outputTokens - price: '2.475E-5' + price: "2.475E-5" - name: 
cacheReadInputTokens - price: '6.6E-7' + price: "6.6E-7" - name: cacheWriteInputTokens - price: '8.25E-6' + price: "8.25E-6" - name: bedrock/us.anthropic.claude-opus-4-20250514-v1:0 units: - name: inputTokens - price: '1.5E-5' + price: "1.5E-5" - name: outputTokens - price: '7.5E-5' + price: "7.5E-5" - name: cacheReadInputTokens - price: '1.5E-6' + price: "1.5E-6" - name: cacheWriteInputTokens - price: '1.875E-5' + price: "1.875E-5" - name: bedrock/us.anthropic.claude-opus-4-1-20250805-v1:0 units: - name: inputTokens - price: '1.5E-5' + price: "1.5E-5" - name: outputTokens - price: '7.5E-5' + price: "7.5E-5" - name: cacheReadInputTokens - price: '1.5E-6' + price: "1.5E-6" - name: cacheWriteInputTokens - price: '1.875E-5' + price: "1.875E-5" # EU model pricing - name: bedrock/eu.amazon.nova-lite-v1:0 units: - name: inputTokens - price: '7.8E-8' + price: "7.8E-8" - name: outputTokens - price: '3.1E-7' + price: "3.1E-7" - name: cacheReadInputTokens - price: '1.9E-8' + price: "1.9E-8" - name: cacheWriteInputTokens - price: '7.8E-8' + price: "7.8E-8" - name: bedrock/eu.amazon.nova-pro-v1:0 units: - name: inputTokens - price: '1.0E-6' + price: "1.0E-6" - name: outputTokens - price: '4.2E-6' + price: "4.2E-6" - name: cacheReadInputTokens - price: '2.6E-7' + price: "2.6E-7" - name: cacheWriteInputTokens - price: '1.0E-6' + price: "1.0E-6" - name: bedrock/eu.anthropic.claude-3-haiku-20240307-v1:0 units: - name: inputTokens - price: '2.5E-7' + price: "2.5E-7" - name: outputTokens - price: '1.25E-6' + price: "1.25E-6" - name: bedrock/eu.anthropic.claude-haiku-4-5-20251001-v1:0 units: - name: inputTokens - price: '1.1E-6' + price: "1.1E-6" - name: outputTokens - price: '5.5E-6' + price: "5.5E-6" - name: cacheReadInputTokens - price: '1.1E-7' + price: "1.1E-7" - name: cacheWriteInputTokens - price: '1.4E-6' + price: "1.4E-6" - name: bedrock/eu.anthropic.claude-3-5-sonnet-20241022-v2:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: 
'1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/eu.anthropic.claude-3-7-sonnet-20250219-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/eu.anthropic.claude-sonnet-4-20250514-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/eu.anthropic.claude-sonnet-4-5-20250929-v1:0 units: - name: inputTokens - price: '3.3E-6' + price: "3.3E-6" - name: outputTokens - price: '1.65E-5' + price: "1.65E-5" - name: cacheReadInputTokens - price: '3.3E-7' + price: "3.3E-7" - name: cacheWriteInputTokens - price: '4.125E-6' + price: "4.125E-6" - name: bedrock/eu.anthropic.claude-sonnet-4-5-20250929-v1:0:1m units: - name: inputTokens - price: '6.6E-6' + price: "6.6E-6" - name: outputTokens - price: '2.475E-5' + price: "2.475E-5" - name: cacheReadInputTokens - price: '6.6E-7' + price: "6.6E-7" - name: cacheWriteInputTokens - price: '8.25E-6' + price: "8.25E-6" # AWS Lambda pricing (US East - N. 
Virginia) - name: lambda/requests units: - name: invocations - price: '2.0E-7' # $0.0000002 per request ($0.20 per 1M requests) - - name: lambda/duration + price: "2.0E-7" # $0.0000002 per request ($0.20 per 1M requests) + - name: lambda/duration units: - name: gb_seconds - price: '1.66667E-5' # $0.0000166667 per GB-second ($16.67 per 1M GB-seconds) + price: "1.66667E-5" # $0.0000166667 per GB-second ($16.67 per 1M GB-seconds) diff --git a/config_library/pattern-2/rvl-cdip-package-sample/config.yaml b/config_library/pattern-2/rvl-cdip-package-sample/config.yaml index 375cba4c6..3aed2fcc7 100644 --- a/config_library/pattern-2/rvl-cdip-package-sample/config.yaml +++ b/config_library/pattern-2/rvl-cdip-package-sample/config.yaml @@ -2,7 +2,7 @@ # SPDX-License-Identifier: MIT-0 notes: Default settings for rvil_cdip configuration ocr: - backend: "textract" # Default to Textract for backward compatibility + backend: "textract" # Default to Textract for backward compatibility model_id: "us.anthropic.claude-3-7-sonnet-20250219-v1:0" system_prompt: "You are an expert OCR system. Extract all text from the provided image accurately, preserving layout where possible." task_prompt: "Extract all text from this document image. Preserve the layout, including paragraphs, tables, and formatting." @@ -11,309 +11,765 @@ ocr: - name: TABLES - name: SIGNATURES image: - dpi: '150' - target_width: '' - target_height: '' + dpi: "150" + target_width: "" + target_height: "" classes: - - name: letter - description: A formal written correspondence with sender/recipient addresses, date, salutation, body, and closing signature - attributes: - - name: sender_name - description: The name of the person or entity who wrote or sent the letter. Look for text following or near terms like 'from', 'sender', 'authored by', 'written by', or at the end of the letter before a signature. 
- confidence_threshold: '0.85' - - name: sender_address - description: The physical address of the sender, typically appearing at the top of the letter. May be labeled as 'address', 'location', or 'from address'. - confidence_threshold: '0.8' - - name: recipient_name - description: The name of the person or entity receiving the letter. Look for this after 'to', 'recipient', 'addressee', or at the beginning of the letter. - confidence_threshold: '0.9' - - name: recipient_address - description: The physical address where the letter is to be delivered. Often labeled as 'to address' or 'delivery address', typically appearing below the recipient name. - - name: date - description: The date when the letter was written. Look for a standalone date or text following phrases like 'written on' or 'dated'. - - name: subject - description: The topic or main point of the letter. Often preceded by 'subject', 'RE:', or 'regarding'. - - name: letter_type - description: The category or classification of the letter, such as 'complaint', 'inquiry', 'invitation', etc. May be indicated by 'type' or 'category'. - - name: signature - description: The handwritten name or mark of the sender at the end of the letter. May follow terms like 'signed by' or simply appear at the bottom of the document. - - name: cc - description: Names of people who receive a copy of the letter in addition to the main recipient. Often preceded by 'cc', 'carbon copy', or 'copy to'. - - name: reference_number - description: An identifying number or code associated with the letter. Look for labels like 'ref', 'reference', or 'our ref'. - - name: form - description: A structured document with labeled fields, checkboxes, or blanks requiring user input and completion - attributes: - - name: form_type - description: The category or purpose of the form, such as 'application', 'registration', 'request', etc. May be identified by 'form name', 'document type', or 'form category'. 
- - name: form_id - description: The unique identifier for the form, typically a number or alphanumeric code. Often labeled as 'form number', 'id', or 'reference number'. - - name: submission_date - description: The date when the form was submitted or filed. Look for text near 'date', 'submitted on', or 'filed on'. - - name: submitter_name - description: The name of the person who submitted the form. May be labeled as 'name', 'submitted by', or 'filed by'. - - name: submitter_id - description: An identification number for the person submitting the form, such as social security number, employee ID, etc. Often labeled as 'id number', 'identification', or 'reference'. - - name: approval_status - description: The current state of approval for the form, such as 'approved', 'pending', 'rejected', etc. Look for terms like 'status', 'approved', or 'pending'. - - name: processed_by - description: The name of the person or department that processed the form. May be indicated by 'processor', 'handled by', or 'approved by'. - - name: processing_date - description: The date when the form was processed or completed. Look for labels like 'processed on' or 'completion date'. - - name: department - description: The organizational unit responsible for the form. Often abbreviated as 'dept' or may appear as 'department' or 'division'. - - name: comments - description: Additional notes or remarks about the form. Look for sections labeled 'notes', 'remarks', or 'comments'. 
- - name: invoice - description: A billing document listing items/services, quantities, prices, payment terms, and transaction totals - attributes: - - name: invoice_number + - $schema: https://json-schema.org/draft/2020-12/schema + $id: letter + x-aws-idp-document-type: letter + type: object + description: >- + A formal written correspondence with sender/recipient addresses, date, + salutation, body, and closing signature + properties: + sender_name: + type: string + description: >- + The name of the person or entity who wrote or sent the letter. Look + for text following or near terms like 'from', 'sender', 'authored by', + 'written by', or at the end of the letter before a signature. + x-aws-idp-confidence-threshold: 0.85 + sender_address: + type: string + description: >- + The physical address of the sender, typically appearing at the top of + the letter. May be labeled as 'address', 'location', or 'from + address'. + x-aws-idp-confidence-threshold: 0.8 + recipient_name: + type: string + description: >- + The name of the person or entity receiving the letter. Look for this + after 'to', 'recipient', 'addressee', or at the beginning of the + letter. + x-aws-idp-confidence-threshold: 0.9 + recipient_address: + type: string + description: >- + The physical address where the letter is to be delivered. Often + labeled as 'to address' or 'delivery address', typically appearing + below the recipient name. + date: + type: string + description: >- + The date when the letter was written. Look for a standalone date or + text following phrases like 'written on' or 'dated'. + subject: + type: string + description: >- + The topic or main point of the letter. Often preceded by 'subject', + 'RE:', or 'regarding'. + letter_type: + type: string + description: >- + The category or classification of the letter, such as 'complaint', + 'inquiry', 'invitation', etc. May be indicated by 'type' or + 'category'. 
+ signature: + type: string + description: >- + The handwritten name or mark of the sender at the end of the letter. + May follow terms like 'signed by' or simply appear at the bottom of + the document. + cc: + type: string + description: >- + Names of people who receive a copy of the letter in addition to the + main recipient. Often preceded by 'cc', 'carbon copy', or 'copy to'. + reference_number: + type: string + description: >- + An identifying number or code associated with the letter. Look for + labels like 'ref', 'reference', or 'our ref'. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: form + x-aws-idp-document-type: form + type: object + description: >- + A structured document with labeled fields, checkboxes, or blanks requiring + user input and completion + properties: + form_type: + type: string + description: >- + The category or purpose of the form, such as 'application', + 'registration', 'request', etc. May be identified by 'form name', + 'document type', or 'form category'. + form_id: + type: string + description: >- + The unique identifier for the form, typically a number or alphanumeric + code. Often labeled as 'form number', 'id', or 'reference number'. + submission_date: + type: string + description: >- + The date when the form was submitted or filed. Look for text near + 'date', 'submitted on', or 'filed on'. + submitter_name: + type: string + description: >- + The name of the person who submitted the form. May be labeled as + 'name', 'submitted by', or 'filed by'. + submitter_id: + type: string + description: >- + An identification number for the person submitting the form, such as + social security number, employee ID, etc. Often labeled as 'id + number', 'identification', or 'reference'. + approval_status: + type: string + description: >- + The current state of approval for the form, such as 'approved', + 'pending', 'rejected', etc. Look for terms like 'status', 'approved', + or 'pending'. 
+ processed_by: + type: string + description: >- + The name of the person or department that processed the form. May be + indicated by 'processor', 'handled by', or 'approved by'. + processing_date: + type: string + description: >- + The date when the form was processed or completed. Look for labels + like 'processed on' or 'completion date'. + department: + type: string + description: >- + The organizational unit responsible for the form. Often abbreviated as + 'dept' or may appear as 'department' or 'division'. + comments: + type: string + description: >- + Additional notes or remarks about the form. Look for sections labeled + 'notes', 'remarks', or 'comments'. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: invoice + x-aws-idp-document-type: invoice + type: object + description: >- + A billing document listing items/services, quantities, prices, payment + terms, and transaction totals + properties: + invoice_number: + type: string description: The unique identifier for the invoice. Look for 'invoice no', 'invoice - - name: invoice_date - description: The date when the invoice was issued. May be labeled as 'date', 'invoice date', or 'billing date'. - - name: due_date - description: The deadline by which payment must be made. Look for 'due date', 'payment due', or 'payable by'. - - name: vendor_name - description: The name of the business providing goods or services. May be labeled as 'vendor', 'seller', 'supplier', or simply appear prominently at the top of the invoice. - - name: vendor_address - description: The physical location of the vendor. Look for 'address', 'location', or 'business address', typically near the vendor name. - - name: customer_name - description: The name of the person or entity being billed. Often preceded by 'customer', 'buyer', or 'bill to'. - - name: customer_address - description: The address where the invoice is sent or goods are delivered. May be labeled as 'billing address' or 'ship to'. 
- - name: items - description: Descriptions of the products or services provided. Look for a section with 'description', 'item details', or 'products', usually in a table format. - - name: quantities - description: The number of each item provided. Often abbreviated as 'qty' or may appear as 'quantity' or 'amount' in a table column. - - name: unit_prices - description: The cost per unit of each item. May be labeled as 'price', 'rate', or 'unit cost'. - - name: subtotal - description: The sum of all items before tax and other charges. Look for 'subtotal' or 'net amount', typically found toward the bottom of the invoice. - - name: tax - description: The amount of tax charged on the invoice. May be labeled as 'tax', 'VAT', or 'GST', usually appearing after the subtotal. - - name: total_amount - description: The final amount to be paid including all charges. Look for 'total', 'grand total', or 'amount due', typically the last figure on the invoice. - - name: payment_terms - description: The conditions under which payment should be made, such as '30 days', 'COD', etc. Often labeled as 'terms', 'payment terms', or 'conditions'. - - name: po_number - description: The purchase order reference number. May be abbreviated as 'PO' or appear as 'purchase order' or 'order reference'. - - name: resume - description: A professional summary showcasing work experience, education, skills, and achievements for job applications - attributes: - - name: full_name - description: The complete name of the job applicant, typically appearing prominently at the top of the resume. May be simply labeled as 'name' or 'applicant name'. - - name: contact_info - description: The phone number, email, and address of the applicant. Look for a section with 'contact', 'phone', 'email', or 'address', usually near the top of the resume. - - name: objective - description: A statement outlining the applicant's career goals. 
May be labeled as 'objective', 'summary', or 'profile', typically appearing early in the resume. - - name: education - description: The academic history and qualifications of the applicant. Look for a section with 'education', 'academic background', or 'qualifications'. - - name: experience - description: The work history and previous roles of the applicant. Often labeled as 'experience', 'work history', or 'employment'. - - name: skills - description: The abilities and competencies of the applicant. Look for a section titled 'skills', 'competencies', or 'expertise'. - - name: certifications - description: Professional credentials and qualifications. May be labeled as 'certifications', 'certificates', or 'credentials'. - - name: languages - description: Languages known and level of proficiency. Often appears in a section labeled 'languages' or 'language proficiency'. - - name: references - description: People who can vouch for the applicant's abilities. Look for 'references' or 'referees', typically at the end of the resume. - - name: achievements - description: Notable accomplishments and recognition. May be labeled as 'achievements', 'accomplishments', or 'awards'. - - name: scientific_publication - description: A peer-reviewed academic document with abstract, methodology, results, citations, and research findings - attributes: - - name: title - description: The name of the scientific paper, typically appearing prominently at the beginning. May be labeled as 'title', 'paper title', or 'article title'. - - name: authors - description: The researchers who conducted the study and wrote the paper. Look for names after 'authors', 'contributors', or 'researchers', usually following the title. - - name: abstract - description: A brief summary of the paper's content. Often labeled as 'abstract' or 'summary', appearing before the main text. - - name: keywords - description: Terms that represent the core topics of the paper. 
Look for a list labeled 'keywords' or 'key terms', typically after the abstract. - - name: publication_date - description: The date when the paper was published. May be preceded by 'published' or labeled as 'publication date'. - - name: journal_name - description: The name of the journal where the paper was published. Look for text following 'journal' or 'publication'. - - name: volume - description: The volume number of the journal. Often abbreviated as 'vol' or may appear as 'volume'. - - name: issue - description: The issue number of the journal. May be labeled as 'issue' or abbreviated as 'no'. - - name: pages - description: The page range of the paper in the journal. Often abbreviated as 'pp' or may appear as 'pages'. - - name: doi - description: The Digital Object Identifier for the paper, a unique alphanumeric string. Look for 'DOI' or 'digital object identifier'. - - name: funding - description: Financial support received for the research. May be indicated by 'funding', 'grants', or 'financial support'. - - name: corresponding_author - description: The author responsible for communication regarding the paper. Look for 'corresponding author' or 'contact author'. - - name: institutions - description: The organizations with which the authors are affiliated. May be labeled as 'affiliations' or 'institutions'. - - name: memo - description: An internal business communication with TO/FROM/DATE/SUBJECT headers for organizational announcements or directives - attributes: - - name: memo_date - description: The date when the memo was written. Look for 'date' or 'memo date', typically near the top of the document. - - name: from - description: The person or department that wrote the memo. May be labeled as 'from', 'sender', or 'author'. - - name: to - description: The intended recipient of the memo. Look for text after 'to', 'recipient', or 'addressee'. - - name: subject - description: The topic of the memo. Often preceded by 'subject', 'RE:', or 'regarding'. 
- - name: memo_type - description: The category or classification of the memo, such as 'informational', 'directive', etc. May be indicated by 'type' or 'category'. - - name: priority - description: The urgency level of the memo, such as 'urgent', 'high', 'normal', etc. Look for 'priority' or 'urgency'. - - name: distribution_list - description: Additional people who receive copies of the memo. May be labeled as 'distribution', 'cc', or 'copy'. - - name: reference_number - description: An identifying number or code for the memo. Look for 'reference' or 'ref no'. - - name: department - description: The organizational unit issuing the memo. Often abbreviated as 'dept' or may appear as 'department' or 'division'. - - name: action_required - description: Steps that should be taken in response to the memo. Look for 'action', 'response needed', or 'next steps'. - - name: advertisement - description: A promotional material featuring product images, marketing text, calls-to-action, and branding elements - attributes: - - name: product_name - description: The name of the item or service being advertised. Look for prominently displayed text that could be a 'product', 'item', or 'service' name. - - name: brand - description: The company or manufacturer of the product. May be indicated by a logo or text labeled as 'brand', 'company', or 'manufacturer'. - - name: price - description: The cost of the product or service. Look for currency symbols or numbers followed by terms like 'price', 'cost', or 'special offer'. - - name: promotion_details - description: Information about special deals or discounts. May be introduced with 'promotion', 'offer', or 'deal'. - - name: validity_period - description: The timeframe during which the offer is valid. Look for phrases like 'valid until', 'offer ends', or 'expires'. - - name: contact_info - description: How to reach the advertiser. May include phone numbers, websites, or addresses following 'contact', 'call', or 'visit'. 
- - name: features - description: Notable qualities or benefits of the product. Often listed under 'features', 'benefits', or 'highlights'. - - name: terms_conditions - description: Legal constraints or limitations of the offer. Look for fine print labeled as 'terms', 'conditions', or 'restrictions'. - - name: call_to_action - description: What the advertisement encourages the reader to do. Often appears as imperative phrases like 'call now', 'visit today', or 'order now'. - - name: disclaimer - description: Legal statements limiting liability or making clarifications. Usually appears as fine print introduced by 'disclaimer' or phrases like 'terms apply' or 'conditions apply'. - - name: email - description: A digital message with email headers (To/From/Subject), timestamps, and conversational threading - attributes: - - name: from_address - description: The email address of the sender. Look for text following 'from', 'sender', or 'sent by', typically at the beginning of the email header. - - name: to_address - description: The email address of the primary recipient. May be labeled as 'to', 'recipient', or 'sent to'. - - name: cc_address - description: Email addresses of additional recipients who receive copies. Look for 'cc' or 'carbon copy' followed by one or more email addresses. - - name: bcc_address - description: Email addresses of hidden recipients. May be labeled as 'bcc' or 'blind copy'. - - name: subject - description: The topic of the email. Often preceded by 'subject', 'RE:', or 'regarding'. - - name: date_sent - description: The date and time when the email was sent. Look for 'date', 'sent on', or 'received', typically in the email header. - - name: attachments - description: Files included with the email. May be indicated by 'attached', 'attachment', or 'enclosed', often with icons or file names. - - name: priority - description: The urgency level of the email, such as 'high', 'normal', etc. Look for 'priority' or 'importance'. 
- - name: thread_id - description: An identifier for the email conversation. May be labeled as 'thread' or 'conversation', typically not visible to regular users. - - name: message_id - description: A unique identifier for the specific email. Look for 'message id' or 'email id', usually hidden in the email metadata. - - name: questionnaire - description: A survey instrument containing numbered questions with multiple choice, rating scales, or open-ended responses - attributes: - - name: form_title - description: The name or title of the questionnaire. Look for prominently displayed text at the beginning that could be a 'title', 'survey name', or 'questionnaire name'. - - name: respondent_info - description: Information about the person completing the questionnaire. May include fields labeled 'respondent', 'participant', or 'name'. - - name: submission_date - description: The date when the questionnaire was completed. Look for 'date', 'completed on', or 'submitted'. - - name: section_headers - description: Titles for different segments of the questionnaire. Often appear as bold or larger text introducing a new 'section', 'part', or 'segment'. - - name: question_types - description: The format of questions (multiple choice, free text, etc.). May be indicated by 'type', 'question format', or 'response format'. - - name: response_options - description: Possible answers for multiple-choice questions. Look for checkboxes, radio buttons, or dropdown menus with 'options', 'choices', or 'answers'. - - name: required_fields - description: Questions that must be answered to complete the questionnaire. Often marked with an asterisk (*) or explicitly labeled as 'required', 'mandatory', or 'must answer'. - - name: instructions - description: Guidance on how to complete the questionnaire. Look for text introduced by 'instructions', 'directions', or 'guidelines'. - - name: survey_id - description: A unique identifier for the questionnaire. 
May be labeled as 'survey id', 'reference number', or 'form id'. - - name: completion_status - description: Whether the questionnaire has been fully completed. Look for indicators of 'status', 'completion', or 'progress', often shown as a percentage or progress bar. - - name: Phone Call Representative Courtesy - description: Measures the perceived politeness and professionalism of the customer service representative during the phone interaction. Checkbox selection on the satisfaction scale with options like "Very Satisfied", "Somewhat Satisfied" etc. - attributes: [] - evaluation_method: '' - - name: Phone call representative knowledge rating - description: Measures the perceived knowledge level of the customer service representative during the phone interaction. Checkbox selection on the satisfaction scale with options like "Very Satisfied", "Somewhat Satisfied" etc. - attributes: [] - - name: Request handling satisfaction rating - description: Measures the customer's level of satisfaction with the way their request was handled. Checkbox selection on the satisfaction scale with options like "Very Satisfied", "Somewhat Satisfied" etc. - attributes: [] - - name: Overall Satisfaction rating - description: This rating BEST describes the way the customer feels about the representative's response to their request for asssistance. Checkbox selection on the satisfaction scale with options like "I was very satisfied", "I was somewhat satisfied" etc. - attributes: [] - - name: Future purchase intent - description: A measure of whether the user will continue to buy the product they contacted about. Checkbox selection with options like "I definitely would", "I probably would" etc. - attributes: [] - - name: Product recommendation intent - description: A measure of whether the caller is willing to recommend the product that they called about to others. Checkbox selection with options like "I definitely would", "I probably would" etc. 
- attributes: [] - - name: specification - description: A technical document detailing precise requirements, measurements, standards, and implementation criteria - attributes: - - name: product_name - description: The name of the item being specified. Look for text labeled as 'product', 'item', or 'model', typically appearing prominently at the beginning. - - name: version - description: The iteration or release number. May be indicated by 'version', 'revision', or 'release', often followed by a number or code. - - name: technical_details - description: Specific characteristics and capabilities. Look for sections labeled 'specifications', 'tech specs', or 'details', often presented in a detailed list. - - name: requirements - description: Necessary conditions or resources. May be introduced with 'requirements', 'prerequisites', or 'needed'. - - name: compatibility - description: What the product can work with. Look for text following 'compatible with', 'works with', or 'supports'. - - name: dimensions - description: Physical measurements of the product. Often labeled as 'dimensions', 'size', or 'measurements', usually including length, width, height, etc. - - name: materials - description: What the product is made from. May be indicated by 'materials', 'composition', or phrases like 'made from'. - - name: standards - description: Industry guidelines or certifications met. Look for references to 'standards', 'certifications', or 'compliance'. - - name: revision_history - description: Record of changes to the specification. Often labeled as 'revisions', 'changes', or 'updates', typically in a table format. - - name: approval_info - description: Details about who has validated the specification. May be indicated by phrases like 'approved by', 'certified by', or 'validated'. 
- - name: generic - description: An unstructured document lacking distinctive formatting or purpose-specific elements of other categories - attributes: - - name: document_type - description: The classification or category of the document. Look for terms like 'type', 'category', or 'class' that indicate what kind of document this is. - - name: document_date - description: The date when the document was created. May be labeled as 'date', 'created on', or 'issued on'. - - name: document_id - description: A unique identifier for the document. Look for 'id', 'reference', or 'number', typically appearing near the top of the document. - - name: title - description: The name or heading of the document. Often appears prominently at the beginning, may be labeled as 'title', 'heading', or 'subject'. - - name: author - description: The person who created the document. Look for 'author', 'creator', or 'sender'. - - name: recipient - description: The person for whom the document is intended. May be indicated by 'recipient', 'to', or 'addressee'. - - name: content_summary - description: A brief description of the document's contents. Look for 'summary', 'abstract', or 'overview', typically appearing early in the document. - - name: status - description: The current state of the document, such as 'draft', 'final', 'pending', etc. May be labeled as 'status', 'state', or 'condition'. - - name: department - description: The organizational unit associated with the document. Often abbreviated as 'dept' or may appear as 'department' or 'division'. - - name: comments - description: Additional notes or remarks about the document. Look for sections labeled 'notes', 'remarks', or 'comments'. + invoice_date: + type: string + description: >- + The date when the invoice was issued. May be labeled as 'date', + 'invoice date', or 'billing date'. + due_date: + type: string + description: >- + The deadline by which payment must be made. Look for 'due date', + 'payment due', or 'payable by'. 
+ vendor_name: + type: string + description: >- + The name of the business providing goods or services. May be labeled + as 'vendor', 'seller', 'supplier', or simply appear prominently at the + top of the invoice. + vendor_address: + type: string + description: >- + The physical location of the vendor. Look for 'address', 'location', + or 'business address', typically near the vendor name. + customer_name: + type: string + description: >- + The name of the person or entity being billed. Often preceded by + 'customer', 'buyer', or 'bill to'. + customer_address: + type: string + description: >- + The address where the invoice is sent or goods are delivered. May be + labeled as 'billing address' or 'ship to'. + items: + type: string + description: >- + Descriptions of the products or services provided. Look for a section + with 'description', 'item details', or 'products', usually in a table + format. + quantities: + type: string + description: >- + The number of each item provided. Often abbreviated as 'qty' or may + appear as 'quantity' or 'amount' in a table column. + unit_prices: + type: string + description: >- + The cost per unit of each item. May be labeled as 'price', 'rate', or + 'unit cost'. + subtotal: + type: string + description: >- + The sum of all items before tax and other charges. Look for 'subtotal' + or 'net amount', typically found toward the bottom of the invoice. + tax: + type: string + description: >- + The amount of tax charged on the invoice. May be labeled as 'tax', + 'VAT', or 'GST', usually appearing after the subtotal. + total_amount: + type: string + description: >- + The final amount to be paid including all charges. Look for 'total', + 'grand total', or 'amount due', typically the last figure on the + invoice. + payment_terms: + type: string + description: >- + The conditions under which payment should be made, such as '30 days', + 'COD', etc. Often labeled as 'terms', 'payment terms', or + 'conditions'. 
+ po_number: + type: string + description: >- + The purchase order reference number. May be abbreviated as 'PO' or + appear as 'purchase order' or 'order reference'. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: resume + x-aws-idp-document-type: resume + type: object + description: >- + A professional summary showcasing work experience, education, skills, and + achievements for job applications + properties: + full_name: + type: string + description: >- + The complete name of the job applicant, typically appearing + prominently at the top of the resume. May be simply labeled as 'name' + or 'applicant name'. + contact_info: + type: string + description: >- + The phone number, email, and address of the applicant. Look for a + section with 'contact', 'phone', 'email', or 'address', usually near + the top of the resume. + objective: + type: string + description: >- + A statement outlining the applicant's career goals. May be labeled as + 'objective', 'summary', or 'profile', typically appearing early in the + resume. + education: + type: string + description: >- + The academic history and qualifications of the applicant. Look for a + section with 'education', 'academic background', or 'qualifications'. + experience: + type: string + description: >- + The work history and previous roles of the applicant. Often labeled as + 'experience', 'work history', or 'employment'. + skills: + type: string + description: >- + The abilities and competencies of the applicant. Look for a section + titled 'skills', 'competencies', or 'expertise'. + certifications: + type: string + description: >- + Professional credentials and qualifications. May be labeled as + 'certifications', 'certificates', or 'credentials'. + languages: + type: string + description: >- + Languages known and level of proficiency. Often appears in a section + labeled 'languages' or 'language proficiency'. 
+ references: + type: string + description: >- + People who can vouch for the applicant's abilities. Look for + 'references' or 'referees', typically at the end of the resume. + achievements: + type: string + description: >- + Notable accomplishments and recognition. May be labeled as + 'achievements', 'accomplishments', or 'awards'. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: scientific_publication + x-aws-idp-document-type: scientific_publication + type: object + description: >- + A peer-reviewed academic document with abstract, methodology, results, + citations, and research findings + properties: + title: + type: string + description: >- + The name of the scientific paper, typically appearing prominently at + the beginning. May be labeled as 'title', 'paper title', or 'article + title'. + authors: + type: string + description: >- + The researchers who conducted the study and wrote the paper. Look for + names after 'authors', 'contributors', or 'researchers', usually + following the title. + abstract: + type: string + description: >- + A brief summary of the paper's content. Often labeled as 'abstract' or + 'summary', appearing before the main text. + keywords: + type: string + description: >- + Terms that represent the core topics of the paper. Look for a list + labeled 'keywords' or 'key terms', typically after the abstract. + publication_date: + type: string + description: >- + The date when the paper was published. May be preceded by 'published' + or labeled as 'publication date'. + journal_name: + type: string + description: >- + The name of the journal where the paper was published. Look for text + following 'journal' or 'publication'. + volume: + type: string + description: >- + The volume number of the journal. Often abbreviated as 'vol' or may + appear as 'volume'. + issue: + type: string + description: >- + The issue number of the journal. May be labeled as 'issue' or + abbreviated as 'no'. 
+ pages: + type: string + description: >- + The page range of the paper in the journal. Often abbreviated as 'pp' + or may appear as 'pages'. + doi: + type: string + description: >- + The Digital Object Identifier for the paper, a unique alphanumeric + string. Look for 'DOI' or 'digital object identifier'. + funding: + type: string + description: >- + Financial support received for the research. May be indicated by + 'funding', 'grants', or 'financial support'. + corresponding_author: + type: string + description: >- + The author responsible for communication regarding the paper. Look for + 'corresponding author' or 'contact author'. + institutions: + type: string + description: >- + The organizations with which the authors are affiliated. May be + labeled as 'affiliations' or 'institutions'. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: memo + x-aws-idp-document-type: memo + type: object + description: >- + An internal business communication with TO/FROM/DATE/SUBJECT headers for + organizational announcements or directives + properties: + memo_date: + type: string + description: >- + The date when the memo was written. Look for 'date' or 'memo date', + typically near the top of the document. + from: + type: string + description: >- + The person or department that wrote the memo. May be labeled as + 'from', 'sender', or 'author'. + to: + type: string + description: >- + The intended recipient of the memo. Look for text after 'to', + 'recipient', or 'addressee'. + subject: + type: string + description: >- + The topic of the memo. Often preceded by 'subject', 'RE:', or + 'regarding'. + memo_type: + type: string + description: >- + The category or classification of the memo, such as 'informational', + 'directive', etc. May be indicated by 'type' or 'category'. + priority: + type: string + description: >- + The urgency level of the memo, such as 'urgent', 'high', 'normal', + etc. Look for 'priority' or 'urgency'. 
+ distribution_list: + type: string + description: >- + Additional people who receive copies of the memo. May be labeled as + 'distribution', 'cc', or 'copy'. + reference_number: + type: string + description: >- + An identifying number or code for the memo. Look for 'reference' or + 'ref no'. + department: + type: string + description: >- + The organizational unit issuing the memo. Often abbreviated as 'dept' + or may appear as 'department' or 'division'. + action_required: + type: string + description: >- + Steps that should be taken in response to the memo. Look for 'action', + 'response needed', or 'next steps'. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: advertisement + x-aws-idp-document-type: advertisement + type: object + description: >- + A promotional material featuring product images, marketing text, + calls-to-action, and branding elements + properties: + product_name: + type: string + description: >- + The name of the item or service being advertised. Look for prominently + displayed text that could be a 'product', 'item', or 'service' name. + brand: + type: string + description: >- + The company or manufacturer of the product. May be indicated by a logo + or text labeled as 'brand', 'company', or 'manufacturer'. + price: + type: string + description: >- + The cost of the product or service. Look for currency symbols or + numbers followed by terms like 'price', 'cost', or 'special offer'. + promotion_details: + type: string + description: >- + Information about special deals or discounts. May be introduced with + 'promotion', 'offer', or 'deal'. + validity_period: + type: string + description: >- + The timeframe during which the offer is valid. Look for phrases like + 'valid until', 'offer ends', or 'expires'. + contact_info: + type: string + description: >- + How to reach the advertiser. May include phone numbers, websites, or + addresses following 'contact', 'call', or 'visit'. 
+ features: + type: string + description: >- + Notable qualities or benefits of the product. Often listed under + 'features', 'benefits', or 'highlights'. + terms_conditions: + type: string + description: >- + Legal constraints or limitations of the offer. Look for fine print + labeled as 'terms', 'conditions', or 'restrictions'. + call_to_action: + type: string + description: >- + What the advertisement encourages the reader to do. Often appears as + imperative phrases like 'call now', 'visit today', or 'order now'. + disclaimer: + type: string + description: >- + Legal statements limiting liability or making clarifications. Usually + appears as fine print introduced by 'disclaimer' or phrases like + 'terms apply' or 'conditions apply'. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: email + x-aws-idp-document-type: email + type: object + description: >- + A digital message with email headers (To/From/Subject), timestamps, and + conversational threading + properties: + from_address: + type: string + description: >- + The email address of the sender. Look for text following 'from', + 'sender', or 'sent by', typically at the beginning of the email + header. + to_address: + type: string + description: >- + The email address of the primary recipient. May be labeled as 'to', + 'recipient', or 'sent to'. + cc_address: + type: string + description: >- + Email addresses of additional recipients who receive copies. Look for + 'cc' or 'carbon copy' followed by one or more email addresses. + bcc_address: + type: string + description: >- + Email addresses of hidden recipients. May be labeled as 'bcc' or + 'blind copy'. + subject: + type: string + description: >- + The topic of the email. Often preceded by 'subject', 'RE:', or + 'regarding'. + date_sent: + type: string + description: >- + The date and time when the email was sent. Look for 'date', 'sent on', + or 'received', typically in the email header. 
+ attachments: + type: string + description: >- + Files included with the email. May be indicated by 'attached', + 'attachment', or 'enclosed', often with icons or file names. + priority: + type: string + description: >- + The urgency level of the email, such as 'high', 'normal', etc. Look + for 'priority' or 'importance'. + thread_id: + type: string + description: >- + An identifier for the email conversation. May be labeled as 'thread' + or 'conversation', typically not visible to regular users. + message_id: + type: string + description: >- + A unique identifier for the specific email. Look for 'message id' or + 'email id', usually hidden in the email metadata. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: questionnaire + x-aws-idp-document-type: questionnaire + type: object + description: >- + A survey instrument containing numbered questions with multiple choice, + rating scales, or open-ended responses + properties: + form_title: + type: string + description: >- + The name or title of the questionnaire. Look for prominently displayed + text at the beginning that could be a 'title', 'survey name', or + 'questionnaire name'. + respondent_info: + type: string + description: >- + Information about the person completing the questionnaire. May include + fields labeled 'respondent', 'participant', or 'name'. + submission_date: + type: string + description: >- + The date when the questionnaire was completed. Look for 'date', + 'completed on', or 'submitted'. + section_headers: + type: string + description: >- + Titles for different segments of the questionnaire. Often appear as + bold or larger text introducing a new 'section', 'part', or 'segment'. + question_types: + type: string + description: >- + The format of questions (multiple choice, free text, etc.). May be + indicated by 'type', 'question format', or 'response format'. + response_options: + type: string + description: >- + Possible answers for multiple-choice questions. 
Look for checkboxes, + radio buttons, or dropdown menus with 'options', 'choices', or + 'answers'. + required_fields: + type: string + description: >- + Questions that must be answered to complete the questionnaire. Often + marked with an asterisk (*) or explicitly labeled as 'required', + 'mandatory', or 'must answer'. + instructions: + type: string + description: >- + Guidance on how to complete the questionnaire. Look for text + introduced by 'instructions', 'directions', or 'guidelines'. + survey_id: + type: string + description: >- + A unique identifier for the questionnaire. May be labeled as 'survey + id', 'reference number', or 'form id'. + completion_status: + type: string + description: >- + Whether the questionnaire has been fully completed. Look for + indicators of 'status', 'completion', or 'progress', often shown as a + percentage or progress bar. + Phone Call Representative Courtesy: + type: string + description: >- + Measures the perceived politeness and professionalism of the customer + service representative during the phone interaction. Checkbox + selection on the satisfaction scale with options like "Very + Satisfied", "Somewhat Satisfied" etc. + x-aws-idp-evaluation-method: "" + Phone call representative knowledge rating: + type: string + description: >- + Measures the perceived knowledge level of the customer service + representative during the phone interaction. Checkbox selection on the + satisfaction scale with options like "Very Satisfied", "Somewhat + Satisfied" etc. + Request handling satisfaction rating: + type: string + description: >- + Measures the customer's level of satisfaction with the way their + request was handled. Checkbox selection on the satisfaction scale with + options like "Very Satisfied", "Somewhat Satisfied" etc. + Overall Satisfaction rating: + type: string + description: >- + This rating BEST describes the way the customer feels about the + representative's response to their request for assistance.
Checkbox + selection on the satisfaction scale with options like "I was very + satisfied", "I was somewhat satisfied" etc. + Future purchase intent: + type: string + description: >- + A measure of whether the user will continue to buy the product they + contacted about. Checkbox selection with options like "I definitely + would", "I probably would" etc. + Product recommendation intent: + type: string + description: >- + A measure of whether the caller is willing to recommend the product + that they called about to others. Checkbox selection with options like + "I definitely would", "I probably would" etc. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: specification + x-aws-idp-document-type: specification + type: object + description: >- + A technical document detailing precise requirements, measurements, + standards, and implementation criteria + properties: + product_name: + type: string + description: >- + The name of the item being specified. Look for text labeled as + 'product', 'item', or 'model', typically appearing prominently at the + beginning. + version: + type: string + description: >- + The iteration or release number. May be indicated by 'version', + 'revision', or 'release', often followed by a number or code. + technical_details: + type: string + description: >- + Specific characteristics and capabilities. Look for sections labeled + 'specifications', 'tech specs', or 'details', often presented in a + detailed list. + requirements: + type: string + description: >- + Necessary conditions or resources. May be introduced with + 'requirements', 'prerequisites', or 'needed'. + compatibility: + type: string + description: >- + What the product can work with. Look for text following 'compatible + with', 'works with', or 'supports'. + dimensions: + type: string + description: >- + Physical measurements of the product. Often labeled as 'dimensions', + 'size', or 'measurements', usually including length, width, height, + etc. 
+ materials: + type: string + description: >- + What the product is made from. May be indicated by 'materials', + 'composition', or phrases like 'made from'. + standards: + type: string + description: >- + Industry guidelines or certifications met. Look for references to + 'standards', 'certifications', or 'compliance'. + revision_history: + type: string + description: >- + Record of changes to the specification. Often labeled as 'revisions', + 'changes', or 'updates', typically in a table format. + approval_info: + type: string + description: >- + Details about who has validated the specification. May be indicated by + phrases like 'approved by', 'certified by', or 'validated'. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: generic + x-aws-idp-document-type: generic + type: object + description: >- + An unstructured document lacking distinctive formatting or + purpose-specific elements of other categories + properties: + document_type: + type: string + description: >- + The classification or category of the document. Look for terms like + 'type', 'category', or 'class' that indicate what kind of document + this is. + document_date: + type: string + description: >- + The date when the document was created. May be labeled as 'date', + 'created on', or 'issued on'. + document_id: + type: string + description: >- + A unique identifier for the document. Look for 'id', 'reference', or + 'number', typically appearing near the top of the document. + title: + type: string + description: >- + The name or heading of the document. Often appears prominently at the + beginning, may be labeled as 'title', 'heading', or 'subject'. + author: + type: string + description: >- + The person who created the document. Look for 'author', 'creator', or + 'sender'. + recipient: + type: string + description: >- + The person for whom the document is intended. May be indicated by + 'recipient', 'to', or 'addressee'. 
+ content_summary: + type: string + description: >- + A brief description of the document's contents. Look for 'summary', + 'abstract', or 'overview', typically appearing early in the document. + status: + type: string + description: >- + The current state of the document, such as 'draft', 'final', + 'pending', etc. May be labeled as 'status', 'state', or 'condition'. + department: + type: string + description: >- + The organizational unit associated with the document. Often + abbreviated as 'dept' or may appear as 'department' or 'division'. + comments: + type: string + description: >- + Additional notes or remarks about the document. Look for sections + labeled 'notes', 'remarks', or 'comments'. classification: maxPagesForClassification: "ALL" image: - target_height: '' - target_width: '' - top_p: '0.1' - max_tokens: '4096' - top_k: '5' + target_height: "" + target_width: "" + top_p: "0.1" + max_tokens: "4096" + top_k: "5" task_prompt: >- @@ -444,21 +900,21 @@ classification: Remember: You must ONLY use document types that appear in the reference data. Do not invent or create new document types. - temperature: '0.0' + temperature: "0.0" model: us.amazon.nova-pro-v1:0 system_prompt: >- You are a document classification expert who can analyze and classify multiple documents and their page boundaries within a document package from various domains. Your task is to determine the document type based on its content and structure, using the provided document type definitions. Your output must be valid JSON according to the requested format. classificationMethod: textbasedHolisticClassification extraction: image: - target_width: '' - target_height: '' - top_p: '0.1' - max_tokens: '10000' - top_k: '5' + target_width: "" + target_height: "" + top_p: "0.1" + max_tokens: "10000" + top_k: "5" task_prompt: >- - + You are an expert in document analysis and information extraction. 
You can understand and extract key information from documents classified as type @@ -521,7 +977,7 @@ extraction: {DOCUMENT_TEXT} - + @@ -542,15 +998,15 @@ extraction: 7. Think step by step before finalizing your answer - temperature: '0.0' + temperature: "0.0" model: us.amazon.nova-pro-v1:0 system_prompt: >- You are a document assistant. Respond only with JSON. Never make up data, only provide data found in the document being provided. summarization: enabled: true - top_p: '0.1' - max_tokens: '4096' - top_k: '5' + top_p: "0.1" + max_tokens: "4096" + top_k: "5" task_prompt: >- @@ -604,7 +1060,7 @@ summarization: Do not include any text, explanations, or notes outside of this JSON structure. The JSON must be properly formatted and parseable. - temperature: '0.0' + temperature: "0.0" model: us.anthropic.claude-3-7-sonnet-20250219-v1:0 system_prompt: >- You are a document summarization expert who can analyze and summarize documents from various domains including medical, financial, legal, and general business documents. Your task is to create a summary that captures the key information, main points, and important details from the document. Your output must be in valid JSON format. \nSummarization Style: Balanced\\nCreate a balanced summary that provides a moderate level of detail. Include the main points and key supporting information, while maintaining the document's overall structure. Aim for a comprehensive yet concise summary.\n Your output MUST be in valid JSON format with markdown content. You MUST strictly adhere to the output format specified in the instructions. 
@@ -612,22 +1068,22 @@ assessment: enabled: true validation_enabled: false image: - target_height: '' - target_width: '' + target_height: "" + target_width: "" granular: enabled: true max_workers: "20" simple_batch_size: "3" list_batch_size: "1" - default_confidence_threshold: '0.8' - top_p: '0.1' - max_tokens: '10000' - top_k: '5' - temperature: '0.0' + default_confidence_threshold: "0.8" + top_p: "0.1" + max_tokens: "10000" + top_k: "5" + temperature: "0.0" model: us.amazon.nova-lite-v1:0 system_prompt: >- You are a document analysis assessment expert. Your role is to evaluate the confidence and accuracy of data extraction results by analyzing them against source documents. - + Provide accurate confidence scores for each assessment. When bounding boxes are requested, provide precise coordinate locations where information appears in the document. task_prompt: >- @@ -669,7 +1125,7 @@ assessment: For each field, provide bounding box coordinates: - bbox: [x1, y1, x2, y2] coordinates in normalized 0-1000 scale - page: Page number where the field appears (starting from 1) - + Coordinate system: - Use normalized scale 0-1000 for both x and y axes - x1, y1 = top-left corner of bounding box @@ -768,9 +1224,9 @@ assessment: evaluation: enabled: true llm_method: - top_p: '0.1' - max_tokens: '4096' - top_k: '5' + top_p: "0.1" + max_tokens: "4096" + top_k: "5" task_prompt: >- I need to evaluate attribute extraction for a document of class: {DOCUMENT_CLASS}. @@ -800,7 +1256,7 @@ evaluation: "score": 0.0 to 1.0, "reason": "Your explanation here" } - temperature: '0.0' + temperature: "0.0" model: us.anthropic.claude-3-haiku-20240307-v1:0 system_prompt: >- You are an evaluator that helps determine if the predicted and expected values match for document attribute extraction. You will consider the context and meaning rather than just exact string matching. 
@@ -850,8 +1306,8 @@ discovery: ] } with_ground_truth: - top_p: '0.1' - temperature: '1.0' + top_p: "0.1" + temperature: "1.0" user_prompt: >- This image contains unstructured data. Analyze the data line by line using the provided ground truth as reference. @@ -876,10 +1332,10 @@ discovery: documents. Use provided ground truth data as reference to optimize field extraction and ensure consistency with expected document structure and field definitions. - max_tokens: '10000' + max_tokens: "10000" without_ground_truth: - top_p: '0.1' - temperature: '1.0' + top_p: "0.1" + temperature: "1.0" user_prompt: >- This image contains forms data. Analyze the form line by line. Image may contains multiple pages, process all the pages. @@ -905,7 +1361,7 @@ discovery: documents. Analyze forms line by line to identify field names, data types, and organizational structure. Focus on creating comprehensive blueprints for document processing without extracting actual values. - max_tokens: '10000' + max_tokens: "10000" agents: error_analyzer: model_id: us.anthropic.claude-sonnet-4-20250514-v1:0 @@ -919,15 +1375,15 @@ agents: 3. Collect relevant logs from CloudWatch 4. Identify any performance issues from X-Ray traces 5. Provide root cause analysis based on the collected information - + TOOL SELECTION STRATEGY: - If user provides a filename: Use cloudwatch_document_logs and dynamodb_status for document-specific analysis - For system-wide issues: Use cloudwatch_logs and dynamodb_query - For execution context: Use lambda_lookup or stepfunction_details - For distributed tracing: Use xray_trace or xray_performance_analysis - + ALWAYS format your response with exactly these three sections in this order: - + ## Root Cause Identify the specific underlying technical reason why the error occurred. Focus on the primary cause, not symptoms. @@ -936,16 +1392,16 @@ agents:
Evidence - + Format evidence with source information. Include relevant data from tool responses: - + **For CloudWatch logs:** **Log Group:** [full log_group name] **Log Stream:** [full log_stream name] ``` [ERROR] timestamp message ``` - + **For other sources (DynamoDB, Step Functions, X-Ray):** **Source:** [service name and resource] ``` @@ -967,14 +1423,14 @@ agents: - Use system-wide tools for pattern analysis - Combine DynamoDB status with CloudWatch logs for complete picture - Leverage X-Ray for distributed system issues - + ROOT CAUSE DETERMINATION: 1. Document Status: Check dynamodb_status first 2. Execution Details: Use stepfunction_details for workflow failures 3. Log Analysis: Use cloudwatch_document_logs or cloudwatch_logs for error details 4. Distributed Tracing: Use xray_performance_analysis for service interaction issues 5. Context: Use lambda_lookup for execution environment - + RECOMMENDATION GUIDELINES: For code-related issues or system bugs: - Do not suggest code modifications @@ -993,7 +1449,7 @@ agents: - last week: 168 hours - last day: 24 hours - No time specified: 24 hours (default) - + IMPORTANT: Do not include any search quality reflections, search quality scores, or meta-analysis sections in your response. Only provide the three required sections: Root Cause, Recommendations, and Evidence. 
parameters: max_log_events: 5 @@ -1002,252 +1458,252 @@ pricing: - name: textract/detect_document_text units: - name: pages - price: '0.0015' + price: "0.0015" - name: textract/analyze_document-Layout units: - name: pages - price: '0.004' + price: "0.004" - name: textract/analyze_document-Signatures units: - name: pages - price: '0.0035' + price: "0.0035" - name: textract/analyze_document-Forms units: - name: pages - price: '0.05' + price: "0.05" - name: textract/analyze_document-Tables units: - name: pages - price: '0.015' + price: "0.015" - name: textract/analyze_document-Tables+Forms units: - name: pages - price: '0.065' + price: "0.065" - name: bedrock/us.amazon.nova-lite-v1:0 units: - name: inputTokens - price: '6.0E-8' + price: "6.0E-8" - name: outputTokens - price: '2.4E-7' + price: "2.4E-7" - name: cacheReadInputTokens - price: '1.5E-8' + price: "1.5E-8" - name: cacheWriteInputTokens - price: '6.0E-8' + price: "6.0E-8" - name: bedrock/us.amazon.nova-pro-v1:0 units: - name: inputTokens - price: '8.0E-7' + price: "8.0E-7" - name: outputTokens - price: '3.2E-6' + price: "3.2E-6" - name: cacheReadInputTokens - price: '2.0E-7' + price: "2.0E-7" - name: cacheWriteInputTokens - price: '8.0E-7' + price: "8.0E-7" - name: bedrock/us.amazon.nova-premier-v1:0 units: - name: inputTokens - price: '2.5E-6' + price: "2.5E-6" - name: outputTokens - price: '1.25E-5' + price: "1.25E-5" - name: bedrock/us.anthropic.claude-3-haiku-20240307-v1:0 units: - name: inputTokens - price: '2.5E-7' + price: "2.5E-7" - name: outputTokens - price: '1.25E-6' + price: "1.25E-6" - name: bedrock/us.anthropic.claude-3-5-haiku-20241022-v1:0 units: - name: inputTokens - price: '8.0E-7' + price: "8.0E-7" - name: outputTokens - price: '4.0E-6' + price: "4.0E-6" - name: cacheReadInputTokens - price: '8.0E-8' + price: "8.0E-8" - name: cacheWriteInputTokens - price: '1.0E-6' + price: "1.0E-6" - name: bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0 units: - name: inputTokens - price: '1.1E-06' + 
price: "1.1E-06" - name: outputTokens - price: '5.5E-06' + price: "5.5E-06" - name: cacheReadInputTokens - price: '1.1E-07' + price: "1.1E-07" - name: cacheWriteInputTokens - price: '1.4E-06' + price: "1.4E-06" - name: bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0:1m units: - name: inputTokens - price: '6.0E-6' + price: "6.0E-6" - name: outputTokens - price: '2.25E-5' + price: "2.25E-5" - name: cacheReadInputTokens - price: '6.0E-7' + price: "6.0E-7" - name: cacheWriteInputTokens - price: '7.5E-6' + price: "7.5E-6" - name: bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0 units: - name: inputTokens - price: '3.3E-6' + price: "3.3E-6" - name: outputTokens - price: '1.65E-5' + price: "1.65E-5" - name: cacheReadInputTokens - price: '3.3E-7' + price: "3.3E-7" - name: cacheWriteInputTokens - price: '4.125E-6' + price: "4.125E-6" - name: bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0:1m units: - name: inputTokens - price: '6.6E-6' + price: "6.6E-6" - name: outputTokens - price: '2.475E-5' + price: "2.475E-5" - name: 
cacheReadInputTokens - price: '6.6E-7' + price: "6.6E-7" - name: cacheWriteInputTokens - price: '8.25E-6' + price: "8.25E-6" - name: bedrock/us.anthropic.claude-opus-4-20250514-v1:0 units: - name: inputTokens - price: '1.5E-5' + price: "1.5E-5" - name: outputTokens - price: '7.5E-5' + price: "7.5E-5" - name: cacheReadInputTokens - price: '1.5E-6' + price: "1.5E-6" - name: cacheWriteInputTokens - price: '1.875E-5' + price: "1.875E-5" - name: bedrock/us.anthropic.claude-opus-4-1-20250805-v1:0 units: - name: inputTokens - price: '1.5E-5' + price: "1.5E-5" - name: outputTokens - price: '7.5E-5' + price: "7.5E-5" - name: cacheReadInputTokens - price: '1.5E-6' + price: "1.5E-6" - name: cacheWriteInputTokens - price: '1.875E-5' + price: "1.875E-5" # EU model pricing - name: bedrock/eu.amazon.nova-lite-v1:0 units: - name: inputTokens - price: '7.8E-8' + price: "7.8E-8" - name: outputTokens - price: '3.1E-7' + price: "3.1E-7" - name: cacheReadInputTokens - price: '1.9E-8' + price: "1.9E-8" - name: cacheWriteInputTokens - price: '7.8E-8' + price: "7.8E-8" - name: bedrock/eu.amazon.nova-pro-v1:0 units: - name: inputTokens - price: '1.0E-6' + price: "1.0E-6" - name: outputTokens - price: '4.2E-6' + price: "4.2E-6" - name: cacheReadInputTokens - price: '2.6E-7' + price: "2.6E-7" - name: cacheWriteInputTokens - price: '1.0E-6' + price: "1.0E-6" - name: bedrock/eu.anthropic.claude-3-haiku-20240307-v1:0 units: - name: inputTokens - price: '2.5E-7' + price: "2.5E-7" - name: outputTokens - price: '1.25E-6' + price: "1.25E-6" - name: bedrock/eu.anthropic.claude-haiku-4-5-20251001-v1:0 units: - name: inputTokens - price: '1.1E-6' + price: "1.1E-6" - name: outputTokens - price: '5.5E-6' + price: "5.5E-6" - name: cacheReadInputTokens - price: '1.1E-7' + price: "1.1E-7" - name: cacheWriteInputTokens - price: '1.4E-6' + price: "1.4E-6" - name: bedrock/eu.anthropic.claude-3-5-sonnet-20241022-v2:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: 
'1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/eu.anthropic.claude-3-7-sonnet-20250219-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/eu.anthropic.claude-sonnet-4-20250514-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/eu.anthropic.claude-sonnet-4-5-20250929-v1:0 units: - name: inputTokens - price: '3.3E-6' + price: "3.3E-6" - name: outputTokens - price: '1.65E-5' + price: "1.65E-5" - name: cacheReadInputTokens - price: '3.3E-7' + price: "3.3E-7" - name: cacheWriteInputTokens - price: '4.125E-6' + price: "4.125E-6" - name: bedrock/eu.anthropic.claude-sonnet-4-5-20250929-v1:0:1m units: - name: inputTokens - price: '6.6E-6' + price: "6.6E-6" - name: outputTokens - price: '2.475E-5' + price: "2.475E-5" - name: cacheReadInputTokens - price: '6.6E-7' + price: "6.6E-7" - name: cacheWriteInputTokens - price: '8.25E-6' + price: "8.25E-6" # AWS Lambda pricing (US East - N. 
Virginia) - name: lambda/requests units: - name: invocations - price: '2.0E-7' # $0.0000002 per request ($0.20 per 1M requests) - - name: lambda/duration + price: "2.0E-7" # $0.0000002 per request ($0.20 per 1M requests) + - name: lambda/duration units: - name: gb_seconds - price: '1.66667E-5' # $0.0000166667 per GB-second ($16.67 per 1M GB-seconds) + price: "1.66667E-5" # $0.0000166667 per GB-second ($16.67 per 1M GB-seconds) diff --git a/config_library/pattern-3/rvl-cdip-package-sample/config.yaml b/config_library/pattern-3/rvl-cdip-package-sample/config.yaml index 073875275..f4d6ecd11 100644 --- a/config_library/pattern-3/rvl-cdip-package-sample/config.yaml +++ b/config_library/pattern-3/rvl-cdip-package-sample/config.yaml @@ -3,7 +3,7 @@ notes: Default settings ocr: - backend: "textract" # Default to Textract for backward compatibility + backend: "textract" # Default to Textract for backward compatibility model_id: "us.anthropic.claude-3-7-sonnet-20250219-v1:0" system_prompt: "You are an expert OCR system. Extract all text from the provided image accurately, preserving layout where possible." task_prompt: "Extract all text from this document image. Preserve the layout, including paragraphs, tables, and formatting." @@ -12,313 +12,769 @@ ocr: - name: TABLES - name: SIGNATURES image: - dpi: '150' - target_width: '' - target_height: '' + dpi: "150" + target_width: "" + target_height: "" classes: - - name: letter - description: A formal written correspondence with sender/recipient addresses, date, salutation, body, and closing signature - attributes: - - name: sender_name - description: The name of the person or entity who wrote or sent the letter. Look for text following or near terms like 'from', 'sender', 'authored by', 'written by', or at the end of the letter before a signature. - confidence_threshold: '0.85' - - name: sender_address - description: The physical address of the sender, typically appearing at the top of the letter. 
May be labeled as 'address', 'location', or 'from address'. - confidence_threshold: '0.8' - - name: recipient_name - description: The name of the person or entity receiving the letter. Look for this after 'to', 'recipient', 'addressee', or at the beginning of the letter. - confidence_threshold: '0.9' - - name: recipient_address - description: The physical address where the letter is to be delivered. Often labeled as 'to address' or 'delivery address', typically appearing below the recipient name. - - name: date - description: The date when the letter was written. Look for a standalone date or text following phrases like 'written on' or 'dated'. - - name: subject - description: The topic or main point of the letter. Often preceded by 'subject', 'RE:', or 'regarding'. - - name: letter_type - description: The category or classification of the letter, such as 'complaint', 'inquiry', 'invitation', etc. May be indicated by 'type' or 'category'. - - name: signature - description: The handwritten name or mark of the sender at the end of the letter. May follow terms like 'signed by' or simply appear at the bottom of the document. - - name: cc - description: Names of people who receive a copy of the letter in addition to the main recipient. Often preceded by 'cc', 'carbon copy', or 'copy to'. - - name: reference_number - description: An identifying number or code associated with the letter. Look for labels like 'ref', 'reference', or 'our ref'. - - name: form - description: A structured document with labeled fields, checkboxes, or blanks requiring user input and completion - attributes: - - name: form_type - description: The category or purpose of the form, such as 'application', 'registration', 'request', etc. May be identified by 'form name', 'document type', or 'form category'. - - name: form_id - description: The unique identifier for the form, typically a number or alphanumeric code. Often labeled as 'form number', 'id', or 'reference number'. 
- - name: submission_date - description: The date when the form was submitted or filed. Look for text near 'date', 'submitted on', or 'filed on'. - - name: submitter_name - description: The name of the person who submitted the form. May be labeled as 'name', 'submitted by', or 'filed by'. - - name: submitter_id - description: An identification number for the person submitting the form, such as social security number, employee ID, etc. Often labeled as 'id number', 'identification', or 'reference'. - - name: approval_status - description: The current state of approval for the form, such as 'approved', 'pending', 'rejected', etc. Look for terms like 'status', 'approved', or 'pending'. - - name: processed_by - description: The name of the person or department that processed the form. May be indicated by 'processor', 'handled by', or 'approved by'. - - name: processing_date - description: The date when the form was processed or completed. Look for labels like 'processed on' or 'completion date'. - - name: department - description: The organizational unit responsible for the form. Often abbreviated as 'dept' or may appear as 'department' or 'division'. - - name: comments - description: Additional notes or remarks about the form. Look for sections labeled 'notes', 'remarks', or 'comments'. - - name: invoice - description: A billing document listing items/services, quantities, prices, payment terms, and transaction totals - attributes: - - name: invoice_number + - $schema: https://json-schema.org/draft/2020-12/schema + $id: letter + x-aws-idp-document-type: letter + type: object + description: >- + A formal written correspondence with sender/recipient addresses, date, + salutation, body, and closing signature + properties: + sender_name: + type: string + description: >- + The name of the person or entity who wrote or sent the letter. 
Look + for text following or near terms like 'from', 'sender', 'authored by', + 'written by', or at the end of the letter before a signature. + x-aws-idp-confidence-threshold: 0.85 + sender_address: + type: string + description: >- + The physical address of the sender, typically appearing at the top of + the letter. May be labeled as 'address', 'location', or 'from + address'. + x-aws-idp-confidence-threshold: 0.8 + recipient_name: + type: string + description: >- + The name of the person or entity receiving the letter. Look for this + after 'to', 'recipient', 'addressee', or at the beginning of the + letter. + x-aws-idp-confidence-threshold: 0.9 + recipient_address: + type: string + description: >- + The physical address where the letter is to be delivered. Often + labeled as 'to address' or 'delivery address', typically appearing + below the recipient name. + date: + type: string + description: >- + The date when the letter was written. Look for a standalone date or + text following phrases like 'written on' or 'dated'. + subject: + type: string + description: >- + The topic or main point of the letter. Often preceded by 'subject', + 'RE:', or 'regarding'. + letter_type: + type: string + description: >- + The category or classification of the letter, such as 'complaint', + 'inquiry', 'invitation', etc. May be indicated by 'type' or + 'category'. + signature: + type: string + description: >- + The handwritten name or mark of the sender at the end of the letter. + May follow terms like 'signed by' or simply appear at the bottom of + the document. + cc: + type: string + description: >- + Names of people who receive a copy of the letter in addition to the + main recipient. Often preceded by 'cc', 'carbon copy', or 'copy to'. + reference_number: + type: string + description: >- + An identifying number or code associated with the letter. Look for + labels like 'ref', 'reference', or 'our ref'. 
+ - $schema: https://json-schema.org/draft/2020-12/schema + $id: form + x-aws-idp-document-type: form + type: object + description: >- + A structured document with labeled fields, checkboxes, or blanks requiring + user input and completion + properties: + form_type: + type: string + description: >- + The category or purpose of the form, such as 'application', + 'registration', 'request', etc. May be identified by 'form name', + 'document type', or 'form category'. + form_id: + type: string + description: >- + The unique identifier for the form, typically a number or alphanumeric + code. Often labeled as 'form number', 'id', or 'reference number'. + submission_date: + type: string + description: >- + The date when the form was submitted or filed. Look for text near + 'date', 'submitted on', or 'filed on'. + submitter_name: + type: string + description: >- + The name of the person who submitted the form. May be labeled as + 'name', 'submitted by', or 'filed by'. + submitter_id: + type: string + description: >- + An identification number for the person submitting the form, such as + social security number, employee ID, etc. Often labeled as 'id + number', 'identification', or 'reference'. + approval_status: + type: string + description: >- + The current state of approval for the form, such as 'approved', + 'pending', 'rejected', etc. Look for terms like 'status', 'approved', + or 'pending'. + processed_by: + type: string + description: >- + The name of the person or department that processed the form. May be + indicated by 'processor', 'handled by', or 'approved by'. + processing_date: + type: string + description: >- + The date when the form was processed or completed. Look for labels + like 'processed on' or 'completion date'. + department: + type: string + description: >- + The organizational unit responsible for the form. Often abbreviated as + 'dept' or may appear as 'department' or 'division'. 
+ comments: + type: string + description: >- + Additional notes or remarks about the form. Look for sections labeled + 'notes', 'remarks', or 'comments'. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: invoice + x-aws-idp-document-type: invoice + type: object + description: >- + A billing document listing items/services, quantities, prices, payment + terms, and transaction totals + properties: + invoice_number: + type: string description: The unique identifier for the invoice. Look for 'invoice no', 'invoice - - name: invoice_date - description: The date when the invoice was issued. May be labeled as 'date', 'invoice date', or 'billing date'. - - name: due_date - description: The deadline by which payment must be made. Look for 'due date', 'payment due', or 'payable by'. - - name: vendor_name - description: The name of the business providing goods or services. May be labeled as 'vendor', 'seller', 'supplier', or simply appear prominently at the top of the invoice. - - name: vendor_address - description: The physical location of the vendor. Look for 'address', 'location', or 'business address', typically near the vendor name. - - name: customer_name - description: The name of the person or entity being billed. Often preceded by 'customer', 'buyer', or 'bill to'. - - name: customer_address - description: The address where the invoice is sent or goods are delivered. May be labeled as 'billing address' or 'ship to'. - - name: items - description: Descriptions of the products or services provided. Look for a section with 'description', 'item details', or 'products', usually in a table format. - - name: quantities - description: The number of each item provided. Often abbreviated as 'qty' or may appear as 'quantity' or 'amount' in a table column. - - name: unit_prices - description: The cost per unit of each item. May be labeled as 'price', 'rate', or 'unit cost'. - - name: subtotal - description: The sum of all items before tax and other charges. 
Look for 'subtotal' or 'net amount', typically found toward the bottom of the invoice. - - name: tax - description: The amount of tax charged on the invoice. May be labeled as 'tax', 'VAT', or 'GST', usually appearing after the subtotal. - - name: total_amount - description: The final amount to be paid including all charges. Look for 'total', 'grand total', or 'amount due', typically the last figure on the invoice. - - name: payment_terms - description: The conditions under which payment should be made, such as '30 days', 'COD', etc. Often labeled as 'terms', 'payment terms', or 'conditions'. - - name: po_number - description: The purchase order reference number. May be abbreviated as 'PO' or appear as 'purchase order' or 'order reference'. - - name: resume - description: A professional summary showcasing work experience, education, skills, and achievements for job applications - attributes: - - name: full_name - description: The complete name of the job applicant, typically appearing prominently at the top of the resume. May be simply labeled as 'name' or 'applicant name'. - - name: contact_info - description: The phone number, email, and address of the applicant. Look for a section with 'contact', 'phone', 'email', or 'address', usually near the top of the resume. - - name: objective - description: A statement outlining the applicant's career goals. May be labeled as 'objective', 'summary', or 'profile', typically appearing early in the resume. - - name: education - description: The academic history and qualifications of the applicant. Look for a section with 'education', 'academic background', or 'qualifications'. - - name: experience - description: The work history and previous roles of the applicant. Often labeled as 'experience', 'work history', or 'employment'. - - name: skills - description: The abilities and competencies of the applicant. Look for a section titled 'skills', 'competencies', or 'expertise'. 
- - name: certifications - description: Professional credentials and qualifications. May be labeled as 'certifications', 'certificates', or 'credentials'. - - name: languages - description: Languages known and level of proficiency. Often appears in a section labeled 'languages' or 'language proficiency'. - - name: references - description: People who can vouch for the applicant's abilities. Look for 'references' or 'referees', typically at the end of the resume. - - name: achievements - description: Notable accomplishments and recognition. May be labeled as 'achievements', 'accomplishments', or 'awards'. - - name: scientific_publication - description: A peer-reviewed academic document with abstract, methodology, results, citations, and research findings - attributes: - - name: title - description: The name of the scientific paper, typically appearing prominently at the beginning. May be labeled as 'title', 'paper title', or 'article title'. - - name: authors - description: The researchers who conducted the study and wrote the paper. Look for names after 'authors', 'contributors', or 'researchers', usually following the title. - - name: abstract - description: A brief summary of the paper's content. Often labeled as 'abstract' or 'summary', appearing before the main text. - - name: keywords - description: Terms that represent the core topics of the paper. Look for a list labeled 'keywords' or 'key terms', typically after the abstract. - - name: publication_date - description: The date when the paper was published. May be preceded by 'published' or labeled as 'publication date'. - - name: journal_name - description: The name of the journal where the paper was published. Look for text following 'journal' or 'publication'. - - name: volume - description: The volume number of the journal. Often abbreviated as 'vol' or may appear as 'volume'. - - name: issue - description: The issue number of the journal. May be labeled as 'issue' or abbreviated as 'no'. 
- - name: pages - description: The page range of the paper in the journal. Often abbreviated as 'pp' or may appear as 'pages'. - - name: doi - description: The Digital Object Identifier for the paper, a unique alphanumeric string. Look for 'DOI' or 'digital object identifier'. - - name: funding - description: Financial support received for the research. May be indicated by 'funding', 'grants', or 'financial support'. - - name: corresponding_author - description: The author responsible for communication regarding the paper. Look for 'corresponding author' or 'contact author'. - - name: institutions - description: The organizations with which the authors are affiliated. May be labeled as 'affiliations' or 'institutions'. - - name: memo - description: An internal business communication with TO/FROM/DATE/SUBJECT headers for organizational announcements or directives - attributes: - - name: memo_date - description: The date when the memo was written. Look for 'date' or 'memo date', typically near the top of the document. - - name: from - description: The person or department that wrote the memo. May be labeled as 'from', 'sender', or 'author'. - - name: to - description: The intended recipient of the memo. Look for text after 'to', 'recipient', or 'addressee'. - - name: subject - description: The topic of the memo. Often preceded by 'subject', 'RE:', or 'regarding'. - - name: memo_type - description: The category or classification of the memo, such as 'informational', 'directive', etc. May be indicated by 'type' or 'category'. - - name: priority - description: The urgency level of the memo, such as 'urgent', 'high', 'normal', etc. Look for 'priority' or 'urgency'. - - name: distribution_list - description: Additional people who receive copies of the memo. May be labeled as 'distribution', 'cc', or 'copy'. - - name: reference_number - description: An identifying number or code for the memo. Look for 'reference' or 'ref no'. 
- - name: department - description: The organizational unit issuing the memo. Often abbreviated as 'dept' or may appear as 'department' or 'division'. - - name: action_required - description: Steps that should be taken in response to the memo. Look for 'action', 'response needed', or 'next steps'. - - name: advertisement - description: A promotional material featuring product images, marketing text, calls-to-action, and branding elements - attributes: - - name: product_name - description: The name of the item or service being advertised. Look for prominently displayed text that could be a 'product', 'item', or 'service' name. - - name: brand - description: The company or manufacturer of the product. May be indicated by a logo or text labeled as 'brand', 'company', or 'manufacturer'. - - name: price - description: The cost of the product or service. Look for currency symbols or numbers followed by terms like 'price', 'cost', or 'special offer'. - - name: promotion_details - description: Information about special deals or discounts. May be introduced with 'promotion', 'offer', or 'deal'. - - name: validity_period - description: The timeframe during which the offer is valid. Look for phrases like 'valid until', 'offer ends', or 'expires'. - - name: contact_info - description: How to reach the advertiser. May include phone numbers, websites, or addresses following 'contact', 'call', or 'visit'. - - name: features - description: Notable qualities or benefits of the product. Often listed under 'features', 'benefits', or 'highlights'. - - name: terms_conditions - description: Legal constraints or limitations of the offer. Look for fine print labeled as 'terms', 'conditions', or 'restrictions'. - - name: call_to_action - description: What the advertisement encourages the reader to do. Often appears as imperative phrases like 'call now', 'visit today', or 'order now'. - - name: disclaimer - description: Legal statements limiting liability or making clarifications. 
Usually appears as fine print introduced by 'disclaimer' or phrases like 'terms apply' or 'conditions apply'. - - name: email - description: A digital message with email headers (To/From/Subject), timestamps, and conversational threading - attributes: - - name: from_address - description: The email address of the sender. Look for text following 'from', 'sender', or 'sent by', typically at the beginning of the email header. - - name: to_address - description: The email address of the primary recipient. May be labeled as 'to', 'recipient', or 'sent to'. - - name: cc_address - description: Email addresses of additional recipients who receive copies. Look for 'cc' or 'carbon copy' followed by one or more email addresses. - - name: bcc_address - description: Email addresses of hidden recipients. May be labeled as 'bcc' or 'blind copy'. - - name: subject - description: The topic of the email. Often preceded by 'subject', 'RE:', or 'regarding'. - - name: date_sent - description: The date and time when the email was sent. Look for 'date', 'sent on', or 'received', typically in the email header. - - name: attachments - description: Files included with the email. May be indicated by 'attached', 'attachment', or 'enclosed', often with icons or file names. - - name: priority - description: The urgency level of the email, such as 'high', 'normal', etc. Look for 'priority' or 'importance'. - - name: thread_id - description: An identifier for the email conversation. May be labeled as 'thread' or 'conversation', typically not visible to regular users. - - name: message_id - description: A unique identifier for the specific email. Look for 'message id' or 'email id', usually hidden in the email metadata. - - name: questionnaire - description: A survey instrument containing numbered questions with multiple choice, rating scales, or open-ended responses - attributes: - - name: form_title - description: The name or title of the questionnaire. 
Look for prominently displayed text at the beginning that could be a 'title', 'survey name', or 'questionnaire name'. - - name: respondent_info - description: Information about the person completing the questionnaire. May include fields labeled 'respondent', 'participant', or 'name'. - - name: submission_date - description: The date when the questionnaire was completed. Look for 'date', 'completed on', or 'submitted'. - - name: section_headers - description: Titles for different segments of the questionnaire. Often appear as bold or larger text introducing a new 'section', 'part', or 'segment'. - - name: question_types - description: The format of questions (multiple choice, free text, etc.). May be indicated by 'type', 'question format', or 'response format'. - - name: response_options - description: Possible answers for multiple-choice questions. Look for checkboxes, radio buttons, or dropdown menus with 'options', 'choices', or 'answers'. - - name: required_fields - description: Questions that must be answered to complete the questionnaire. Often marked with an asterisk (*) or explicitly labeled as 'required', 'mandatory', or 'must answer'. - - name: instructions - description: Guidance on how to complete the questionnaire. Look for text introduced by 'instructions', 'directions', or 'guidelines'. - - name: survey_id - description: A unique identifier for the questionnaire. May be labeled as 'survey id', 'reference number', or 'form id'. - - name: completion_status - description: Whether the questionnaire has been fully completed. Look for indicators of 'status', 'completion', or 'progress', often shown as a percentage or progress bar. - - name: Phone Call Representative Courtesy - description: Measures the perceived politeness and professionalism of the customer service representative during the phone interaction. Checkbox selection on the satisfaction scale with options like "Very Satisfied", "Somewhat Satisfied" etc. 
- attributes: [] - evaluation_method: '' - - name: Phone call representative knowledge rating - description: Measures the perceived knowledge level of the customer service representative during the phone interaction. Checkbox selection on the satisfaction scale with options like "Very Satisfied", "Somewhat Satisfied" etc. - attributes: [] - - name: Request handling satisfaction rating - description: Measures the customer's level of satisfaction with the way their request was handled. Checkbox selection on the satisfaction scale with options like "Very Satisfied", "Somewhat Satisfied" etc. - attributes: [] - - name: Overall Satisfaction rating - description: This rating BEST describes the way the customer feels about the representative's response to their request for asssistance. Checkbox selection on the satisfaction scale with options like "I was very satisfied", "I was somewhat satisfied" etc. - attributes: [] - - name: Future purchase intent - description: A measure of whether the user will continue to buy the product they contacted about. Checkbox selection with options like "I definitely would", "I probably would" etc. - attributes: [] - - name: Product recommendation intent - description: A measure of whether the caller is willing to recommend the product that they called about to others. Checkbox selection with options like "I definitely would", "I probably would" etc. - attributes: [] - - name: specification - description: A technical document detailing precise requirements, measurements, standards, and implementation criteria - attributes: - - name: product_name - description: The name of the item being specified. Look for text labeled as 'product', 'item', or 'model', typically appearing prominently at the beginning. - - name: version - description: The iteration or release number. May be indicated by 'version', 'revision', or 'release', often followed by a number or code. 
- - name: technical_details - description: Specific characteristics and capabilities. Look for sections labeled 'specifications', 'tech specs', or 'details', often presented in a detailed list. - - name: requirements - description: Necessary conditions or resources. May be introduced with 'requirements', 'prerequisites', or 'needed'. - - name: compatibility - description: What the product can work with. Look for text following 'compatible with', 'works with', or 'supports'. - - name: dimensions - description: Physical measurements of the product. Often labeled as 'dimensions', 'size', or 'measurements', usually including length, width, height, etc. - - name: materials - description: What the product is made from. May be indicated by 'materials', 'composition', or phrases like 'made from'. - - name: standards - description: Industry guidelines or certifications met. Look for references to 'standards', 'certifications', or 'compliance'. - - name: revision_history - description: Record of changes to the specification. Often labeled as 'revisions', 'changes', or 'updates', typically in a table format. - - name: approval_info - description: Details about who has validated the specification. May be indicated by phrases like 'approved by', 'certified by', or 'validated'. - - name: generic - description: An unstructured document lacking distinctive formatting or purpose-specific elements of other categories - attributes: - - name: document_type - description: The classification or category of the document. Look for terms like 'type', 'category', or 'class' that indicate what kind of document this is. - - name: document_date - description: The date when the document was created. May be labeled as 'date', 'created on', or 'issued on'. - - name: document_id - description: A unique identifier for the document. Look for 'id', 'reference', or 'number', typically appearing near the top of the document. - - name: title - description: The name or heading of the document. 
Often appears prominently at the beginning, may be labeled as 'title', 'heading', or 'subject'. - - name: author - description: The person who created the document. Look for 'author', 'creator', or 'sender'. - - name: recipient - description: The person for whom the document is intended. May be indicated by 'recipient', 'to', or 'addressee'. - - name: content_summary - description: A brief description of the document's contents. Look for 'summary', 'abstract', or 'overview', typically appearing early in the document. - - name: status - description: The current state of the document, such as 'draft', 'final', 'pending', etc. May be labeled as 'status', 'state', or 'condition'. - - name: department - description: The organizational unit associated with the document. Often abbreviated as 'dept' or may appear as 'department' or 'division'. - - name: comments - description: Additional notes or remarks about the document. Look for sections labeled 'notes', 'remarks', or 'comments'. + invoice_date: + type: string + description: >- + The date when the invoice was issued. May be labeled as 'date', + 'invoice date', or 'billing date'. + due_date: + type: string + description: >- + The deadline by which payment must be made. Look for 'due date', + 'payment due', or 'payable by'. + vendor_name: + type: string + description: >- + The name of the business providing goods or services. May be labeled + as 'vendor', 'seller', 'supplier', or simply appear prominently at the + top of the invoice. + vendor_address: + type: string + description: >- + The physical location of the vendor. Look for 'address', 'location', + or 'business address', typically near the vendor name. + customer_name: + type: string + description: >- + The name of the person or entity being billed. Often preceded by + 'customer', 'buyer', or 'bill to'. + customer_address: + type: string + description: >- + The address where the invoice is sent or goods are delivered. 
May be + labeled as 'billing address' or 'ship to'. + items: + type: string + description: >- + Descriptions of the products or services provided. Look for a section + with 'description', 'item details', or 'products', usually in a table + format. + quantities: + type: string + description: >- + The number of each item provided. Often abbreviated as 'qty' or may + appear as 'quantity' or 'amount' in a table column. + unit_prices: + type: string + description: >- + The cost per unit of each item. May be labeled as 'price', 'rate', or + 'unit cost'. + subtotal: + type: string + description: >- + The sum of all items before tax and other charges. Look for 'subtotal' + or 'net amount', typically found toward the bottom of the invoice. + tax: + type: string + description: >- + The amount of tax charged on the invoice. May be labeled as 'tax', + 'VAT', or 'GST', usually appearing after the subtotal. + total_amount: + type: string + description: >- + The final amount to be paid including all charges. Look for 'total', + 'grand total', or 'amount due', typically the last figure on the + invoice. + payment_terms: + type: string + description: >- + The conditions under which payment should be made, such as '30 days', + 'COD', etc. Often labeled as 'terms', 'payment terms', or + 'conditions'. + po_number: + type: string + description: >- + The purchase order reference number. May be abbreviated as 'PO' or + appear as 'purchase order' or 'order reference'. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: resume + x-aws-idp-document-type: resume + type: object + description: >- + A professional summary showcasing work experience, education, skills, and + achievements for job applications + properties: + full_name: + type: string + description: >- + The complete name of the job applicant, typically appearing + prominently at the top of the resume. May be simply labeled as 'name' + or 'applicant name'. 
+ contact_info: + type: string + description: >- + The phone number, email, and address of the applicant. Look for a + section with 'contact', 'phone', 'email', or 'address', usually near + the top of the resume. + objective: + type: string + description: >- + A statement outlining the applicant's career goals. May be labeled as + 'objective', 'summary', or 'profile', typically appearing early in the + resume. + education: + type: string + description: >- + The academic history and qualifications of the applicant. Look for a + section with 'education', 'academic background', or 'qualifications'. + experience: + type: string + description: >- + The work history and previous roles of the applicant. Often labeled as + 'experience', 'work history', or 'employment'. + skills: + type: string + description: >- + The abilities and competencies of the applicant. Look for a section + titled 'skills', 'competencies', or 'expertise'. + certifications: + type: string + description: >- + Professional credentials and qualifications. May be labeled as + 'certifications', 'certificates', or 'credentials'. + languages: + type: string + description: >- + Languages known and level of proficiency. Often appears in a section + labeled 'languages' or 'language proficiency'. + references: + type: string + description: >- + People who can vouch for the applicant's abilities. Look for + 'references' or 'referees', typically at the end of the resume. + achievements: + type: string + description: >- + Notable accomplishments and recognition. May be labeled as + 'achievements', 'accomplishments', or 'awards'. 
+ - $schema: https://json-schema.org/draft/2020-12/schema + $id: scientific_publication + x-aws-idp-document-type: scientific_publication + type: object + description: >- + A peer-reviewed academic document with abstract, methodology, results, + citations, and research findings + properties: + title: + type: string + description: >- + The name of the scientific paper, typically appearing prominently at + the beginning. May be labeled as 'title', 'paper title', or 'article + title'. + authors: + type: string + description: >- + The researchers who conducted the study and wrote the paper. Look for + names after 'authors', 'contributors', or 'researchers', usually + following the title. + abstract: + type: string + description: >- + A brief summary of the paper's content. Often labeled as 'abstract' or + 'summary', appearing before the main text. + keywords: + type: string + description: >- + Terms that represent the core topics of the paper. Look for a list + labeled 'keywords' or 'key terms', typically after the abstract. + publication_date: + type: string + description: >- + The date when the paper was published. May be preceded by 'published' + or labeled as 'publication date'. + journal_name: + type: string + description: >- + The name of the journal where the paper was published. Look for text + following 'journal' or 'publication'. + volume: + type: string + description: >- + The volume number of the journal. Often abbreviated as 'vol' or may + appear as 'volume'. + issue: + type: string + description: >- + The issue number of the journal. May be labeled as 'issue' or + abbreviated as 'no'. + pages: + type: string + description: >- + The page range of the paper in the journal. Often abbreviated as 'pp' + or may appear as 'pages'. + doi: + type: string + description: >- + The Digital Object Identifier for the paper, a unique alphanumeric + string. Look for 'DOI' or 'digital object identifier'. 
+ funding: + type: string + description: >- + Financial support received for the research. May be indicated by + 'funding', 'grants', or 'financial support'. + corresponding_author: + type: string + description: >- + The author responsible for communication regarding the paper. Look for + 'corresponding author' or 'contact author'. + institutions: + type: string + description: >- + The organizations with which the authors are affiliated. May be + labeled as 'affiliations' or 'institutions'. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: memo + x-aws-idp-document-type: memo + type: object + description: >- + An internal business communication with TO/FROM/DATE/SUBJECT headers for + organizational announcements or directives + properties: + memo_date: + type: string + description: >- + The date when the memo was written. Look for 'date' or 'memo date', + typically near the top of the document. + from: + type: string + description: >- + The person or department that wrote the memo. May be labeled as + 'from', 'sender', or 'author'. + to: + type: string + description: >- + The intended recipient of the memo. Look for text after 'to', + 'recipient', or 'addressee'. + subject: + type: string + description: >- + The topic of the memo. Often preceded by 'subject', 'RE:', or + 'regarding'. + memo_type: + type: string + description: >- + The category or classification of the memo, such as 'informational', + 'directive', etc. May be indicated by 'type' or 'category'. + priority: + type: string + description: >- + The urgency level of the memo, such as 'urgent', 'high', 'normal', + etc. Look for 'priority' or 'urgency'. + distribution_list: + type: string + description: >- + Additional people who receive copies of the memo. May be labeled as + 'distribution', 'cc', or 'copy'. + reference_number: + type: string + description: >- + An identifying number or code for the memo. Look for 'reference' or + 'ref no'. 
+ department: + type: string + description: >- + The organizational unit issuing the memo. Often abbreviated as 'dept' + or may appear as 'department' or 'division'. + action_required: + type: string + description: >- + Steps that should be taken in response to the memo. Look for 'action', + 'response needed', or 'next steps'. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: advertisement + x-aws-idp-document-type: advertisement + type: object + description: >- + A promotional material featuring product images, marketing text, + calls-to-action, and branding elements + properties: + product_name: + type: string + description: >- + The name of the item or service being advertised. Look for prominently + displayed text that could be a 'product', 'item', or 'service' name. + brand: + type: string + description: >- + The company or manufacturer of the product. May be indicated by a logo + or text labeled as 'brand', 'company', or 'manufacturer'. + price: + type: string + description: >- + The cost of the product or service. Look for currency symbols or + numbers followed by terms like 'price', 'cost', or 'special offer'. + promotion_details: + type: string + description: >- + Information about special deals or discounts. May be introduced with + 'promotion', 'offer', or 'deal'. + validity_period: + type: string + description: >- + The timeframe during which the offer is valid. Look for phrases like + 'valid until', 'offer ends', or 'expires'. + contact_info: + type: string + description: >- + How to reach the advertiser. May include phone numbers, websites, or + addresses following 'contact', 'call', or 'visit'. + features: + type: string + description: >- + Notable qualities or benefits of the product. Often listed under + 'features', 'benefits', or 'highlights'. + terms_conditions: + type: string + description: >- + Legal constraints or limitations of the offer. Look for fine print + labeled as 'terms', 'conditions', or 'restrictions'. 
+ call_to_action: + type: string + description: >- + What the advertisement encourages the reader to do. Often appears as + imperative phrases like 'call now', 'visit today', or 'order now'. + disclaimer: + type: string + description: >- + Legal statements limiting liability or making clarifications. Usually + appears as fine print introduced by 'disclaimer' or phrases like + 'terms apply' or 'conditions apply'. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: email + x-aws-idp-document-type: email + type: object + description: >- + A digital message with email headers (To/From/Subject), timestamps, and + conversational threading + properties: + from_address: + type: string + description: >- + The email address of the sender. Look for text following 'from', + 'sender', or 'sent by', typically at the beginning of the email + header. + to_address: + type: string + description: >- + The email address of the primary recipient. May be labeled as 'to', + 'recipient', or 'sent to'. + cc_address: + type: string + description: >- + Email addresses of additional recipients who receive copies. Look for + 'cc' or 'carbon copy' followed by one or more email addresses. + bcc_address: + type: string + description: >- + Email addresses of hidden recipients. May be labeled as 'bcc' or + 'blind copy'. + subject: + type: string + description: >- + The topic of the email. Often preceded by 'subject', 'RE:', or + 'regarding'. + date_sent: + type: string + description: >- + The date and time when the email was sent. Look for 'date', 'sent on', + or 'received', typically in the email header. + attachments: + type: string + description: >- + Files included with the email. May be indicated by 'attached', + 'attachment', or 'enclosed', often with icons or file names. + priority: + type: string + description: >- + The urgency level of the email, such as 'high', 'normal', etc. Look + for 'priority' or 'importance'. 
+ thread_id: + type: string + description: >- + An identifier for the email conversation. May be labeled as 'thread' + or 'conversation', typically not visible to regular users. + message_id: + type: string + description: >- + A unique identifier for the specific email. Look for 'message id' or + 'email id', usually hidden in the email metadata. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: questionnaire + x-aws-idp-document-type: questionnaire + type: object + description: >- + A survey instrument containing numbered questions with multiple choice, + rating scales, or open-ended responses + properties: + form_title: + type: string + description: >- + The name or title of the questionnaire. Look for prominently displayed + text at the beginning that could be a 'title', 'survey name', or + 'questionnaire name'. + respondent_info: + type: string + description: >- + Information about the person completing the questionnaire. May include + fields labeled 'respondent', 'participant', or 'name'. + submission_date: + type: string + description: >- + The date when the questionnaire was completed. Look for 'date', + 'completed on', or 'submitted'. + section_headers: + type: string + description: >- + Titles for different segments of the questionnaire. Often appear as + bold or larger text introducing a new 'section', 'part', or 'segment'. + question_types: + type: string + description: >- + The format of questions (multiple choice, free text, etc.). May be + indicated by 'type', 'question format', or 'response format'. + response_options: + type: string + description: >- + Possible answers for multiple-choice questions. Look for checkboxes, + radio buttons, or dropdown menus with 'options', 'choices', or + 'answers'. + required_fields: + type: string + description: >- + Questions that must be answered to complete the questionnaire. Often + marked with an asterisk (*) or explicitly labeled as 'required', + 'mandatory', or 'must answer'. 
+ instructions: + type: string + description: >- + Guidance on how to complete the questionnaire. Look for text + introduced by 'instructions', 'directions', or 'guidelines'. + survey_id: + type: string + description: >- + A unique identifier for the questionnaire. May be labeled as 'survey + id', 'reference number', or 'form id'. + completion_status: + type: string + description: >- + Whether the questionnaire has been fully completed. Look for + indicators of 'status', 'completion', or 'progress', often shown as a + percentage or progress bar. + Phone Call Representative Courtesy: + type: string + description: >- + Measures the perceived politeness and professionalism of the customer + service representative during the phone interaction. Checkbox + selection on the satisfaction scale with options like "Very + Satisfied", "Somewhat Satisfied" etc. + x-aws-idp-evaluation-method: "" + Phone call representative knowledge rating: + type: string + description: >- + Measures the perceived knowledge level of the customer service + representative during the phone interaction. Checkbox selection on the + satisfaction scale with options like "Very Satisfied", "Somewhat + Satisfied" etc. + Request handling satisfaction rating: + type: string + description: >- + Measures the customer's level of satisfaction with the way their + request was handled. Checkbox selection on the satisfaction scale with + options like "Very Satisfied", "Somewhat Satisfied" etc. + Overall Satisfaction rating: + type: string + description: >- + This rating BEST describes the way the customer feels about the + representative's response to their request for assistance. Checkbox + selection on the satisfaction scale with options like "I was very + satisfied", "I was somewhat satisfied" etc. + Future purchase intent: + type: string + description: >- + A measure of whether the user will continue to buy the product they + contacted about.
Checkbox selection with options like "I definitely + would", "I probably would" etc. + Product recommendation intent: + type: string + description: >- + A measure of whether the caller is willing to recommend the product + that they called about to others. Checkbox selection with options like + "I definitely would", "I probably would" etc. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: specification + x-aws-idp-document-type: specification + type: object + description: >- + A technical document detailing precise requirements, measurements, + standards, and implementation criteria + properties: + product_name: + type: string + description: >- + The name of the item being specified. Look for text labeled as + 'product', 'item', or 'model', typically appearing prominently at the + beginning. + version: + type: string + description: >- + The iteration or release number. May be indicated by 'version', + 'revision', or 'release', often followed by a number or code. + technical_details: + type: string + description: >- + Specific characteristics and capabilities. Look for sections labeled + 'specifications', 'tech specs', or 'details', often presented in a + detailed list. + requirements: + type: string + description: >- + Necessary conditions or resources. May be introduced with + 'requirements', 'prerequisites', or 'needed'. + compatibility: + type: string + description: >- + What the product can work with. Look for text following 'compatible + with', 'works with', or 'supports'. + dimensions: + type: string + description: >- + Physical measurements of the product. Often labeled as 'dimensions', + 'size', or 'measurements', usually including length, width, height, + etc. + materials: + type: string + description: >- + What the product is made from. May be indicated by 'materials', + 'composition', or phrases like 'made from'. + standards: + type: string + description: >- + Industry guidelines or certifications met. 
Look for references to + 'standards', 'certifications', or 'compliance'. + revision_history: + type: string + description: >- + Record of changes to the specification. Often labeled as 'revisions', + 'changes', or 'updates', typically in a table format. + approval_info: + type: string + description: >- + Details about who has validated the specification. May be indicated by + phrases like 'approved by', 'certified by', or 'validated'. + - $schema: https://json-schema.org/draft/2020-12/schema + $id: generic + x-aws-idp-document-type: generic + type: object + description: >- + An unstructured document lacking distinctive formatting or + purpose-specific elements of other categories + properties: + document_type: + type: string + description: >- + The classification or category of the document. Look for terms like + 'type', 'category', or 'class' that indicate what kind of document + this is. + document_date: + type: string + description: >- + The date when the document was created. May be labeled as 'date', + 'created on', or 'issued on'. + document_id: + type: string + description: >- + A unique identifier for the document. Look for 'id', 'reference', or + 'number', typically appearing near the top of the document. + title: + type: string + description: >- + The name or heading of the document. Often appears prominently at the + beginning, may be labeled as 'title', 'heading', or 'subject'. + author: + type: string + description: >- + The person who created the document. Look for 'author', 'creator', or + 'sender'. + recipient: + type: string + description: >- + The person for whom the document is intended. May be indicated by + 'recipient', 'to', or 'addressee'. + content_summary: + type: string + description: >- + A brief description of the document's contents. Look for 'summary', + 'abstract', or 'overview', typically appearing early in the document. 
+ status: + type: string + description: >- + The current state of the document, such as 'draft', 'final', + 'pending', etc. May be labeled as 'status', 'state', or 'condition'. + department: + type: string + description: >- + The organizational unit associated with the document. Often + abbreviated as 'dept' or may appear as 'department' or 'division'. + comments: + type: string + description: >- + Additional notes or remarks about the document. Look for sections + labeled 'notes', 'remarks', or 'comments'. classification: model: Custom fine tuned UDOP model extraction: image: - target_width: '' - target_height: '' - top_p: '0.1' - max_tokens: '10000' - top_k: '5' + target_width: "" + target_height: "" + top_p: "0.1" + max_tokens: "10000" + top_k: "5" task_prompt: >- - + You are an expert in document analysis and information extraction. You can understand and extract key information from documents classified as type @@ -381,7 +837,7 @@ extraction: {DOCUMENT_TEXT} - + @@ -402,15 +858,15 @@ extraction: 7. Think step by step before finalizing your answer - temperature: '0.0' + temperature: "0.0" model: us.amazon.nova-pro-v1:0 system_prompt: >- You are a document assistant. Respond only with JSON. Never make up data, only provide data found in the document being provided. summarization: enabled: true - top_p: '0.1' - max_tokens: '4096' - top_k: '5' + top_p: "0.1" + max_tokens: "4096" + top_k: "5" task_prompt: >- @@ -464,29 +920,29 @@ summarization: Do not include any text, explanations, or notes outside of this JSON structure. The JSON must be properly formatted and parseable. - temperature: '0.0' + temperature: "0.0" model: us.anthropic.claude-3-7-sonnet-20250219-v1:0 system_prompt: >- You are a document summarization expert who can analyze and summarize documents from various domains including medical, financial, legal, and general business documents. 
Your task is to create a summary that captures the key information, main points, and important details from the document. Your output must be in valid JSON format. \nSummarization Style: Balanced\\nCreate a balanced summary that provides a moderate level of detail. Include the main points and key supporting information, while maintaining the document's overall structure. Aim for a comprehensive yet concise summary.\n Your output MUST be in valid JSON format with markdown content. You MUST strictly adhere to the output format specified in the instructions. assessment: enabled: true image: - target_height: '' - target_width: '' + target_height: "" + target_width: "" granular: enabled: true max_workers: "20" simple_batch_size: "3" list_batch_size: "1" - default_confidence_threshold: '0.8' - top_p: '0.1' - max_tokens: '10000' - top_k: '5' - temperature: '0.0' + default_confidence_threshold: "0.8" + top_p: "0.1" + max_tokens: "10000" + top_k: "5" + temperature: "0.0" model: us.amazon.nova-lite-v1:0 system_prompt: >- You are a document analysis assessment expert. Your role is to evaluate the confidence and accuracy of data extraction results by analyzing them against source documents. - + Provide accurate confidence scores for each assessment. When bounding boxes are requested, provide precise coordinate locations where information appears in the document. task_prompt: >- @@ -528,7 +984,7 @@ assessment: For each field, provide bounding box coordinates: - bbox: [x1, y1, x2, y2] coordinates in normalized 0-1000 scale - page: Page number where the field appears (starting from 1) - + Coordinate system: - Use normalized scale 0-1000 for both x and y axes - x1, y1 = top-left corner of bounding box @@ -627,9 +1083,9 @@ assessment: evaluation: enabled: true llm_method: - top_p: '0.1' - max_tokens: '4096' - top_k: '5' + top_p: "0.1" + max_tokens: "4096" + top_k: "5" task_prompt: >- I need to evaluate attribute extraction for a document of class: {DOCUMENT_CLASS}. 
@@ -659,7 +1115,7 @@ evaluation: "score": 0.0 to 1.0, "reason": "Your explanation here" } - temperature: '0.0' + temperature: "0.0" model: us.anthropic.claude-3-haiku-20240307-v1:0 system_prompt: >- You are an evaluator that helps determine if the predicted and expected values match for document attribute extraction. You will consider the context and meaning rather than just exact string matching. @@ -709,8 +1165,8 @@ discovery: ] } with_ground_truth: - top_p: '0.1' - temperature: '1.0' + top_p: "0.1" + temperature: "1.0" user_prompt: >- This image contains unstructured data. Analyze the data line by line using the provided ground truth as reference. @@ -735,10 +1191,10 @@ discovery: documents. Use provided ground truth data as reference to optimize field extraction and ensure consistency with expected document structure and field definitions. - max_tokens: '10000' + max_tokens: "10000" without_ground_truth: - top_p: '0.1' - temperature: '1.0' + top_p: "0.1" + temperature: "1.0" user_prompt: >- This image contains forms data. Analyze the form line by line. Image may contains multiple pages, process all the pages. @@ -764,7 +1220,7 @@ discovery: documents. Analyze forms line by line to identify field names, data types, and organizational structure. Focus on creating comprehensive blueprints for document processing without extracting actual values. - max_tokens: '10000' + max_tokens: "10000" agents: error_analyzer: model_id: us.anthropic.claude-sonnet-4-20250514-v1:0 @@ -778,15 +1234,15 @@ agents: 3. Collect relevant logs from CloudWatch 4. Identify any performance issues from X-Ray traces 5. 
Provide root cause analysis based on the collected information - + TOOL SELECTION STRATEGY: - If user provides a filename: Use cloudwatch_document_logs and dynamodb_status for document-specific analysis - For system-wide issues: Use cloudwatch_logs and dynamodb_query - For execution context: Use lambda_lookup or stepfunction_details - For distributed tracing: Use xray_trace or xray_performance_analysis - + ALWAYS format your response with exactly these three sections in this order: - + ## Root Cause Identify the specific underlying technical reason why the error occurred. Focus on the primary cause, not symptoms. @@ -795,16 +1251,16 @@ agents:
Evidence - + Format evidence with source information. Include relevant data from tool responses: - + **For CloudWatch logs:** **Log Group:** [full log_group name] **Log Stream:** [full log_stream name] ``` [ERROR] timestamp message ``` - + **For other sources (DynamoDB, Step Functions, X-Ray):** **Source:** [service name and resource] ``` @@ -826,14 +1282,14 @@ agents: - Use system-wide tools for pattern analysis - Combine DynamoDB status with CloudWatch logs for complete picture - Leverage X-Ray for distributed system issues - + ROOT CAUSE DETERMINATION: 1. Document Status: Check dynamodb_status first 2. Execution Details: Use stepfunction_details for workflow failures 3. Log Analysis: Use cloudwatch_document_logs or cloudwatch_logs for error details 4. Distributed Tracing: Use xray_performance_analysis for service interaction issues 5. Context: Use lambda_lookup for execution environment - + RECOMMENDATION GUIDELINES: For code-related issues or system bugs: - Do not suggest code modifications @@ -852,7 +1308,7 @@ agents: - last week: 168 hours - last day: 24 hours - No time specified: 24 hours (default) - + IMPORTANT: Do not include any search quality reflections, search quality scores, or meta-analysis sections in your response. Only provide the three required sections: Root Cause, Recommendations, and Evidence. 
parameters: max_log_events: 5 @@ -861,252 +1317,252 @@ pricing: - name: textract/detect_document_text units: - name: pages - price: '0.0015' + price: "0.0015" - name: textract/analyze_document-Layout units: - name: pages - price: '0.004' + price: "0.004" - name: textract/analyze_document-Signatures units: - name: pages - price: '0.0035' + price: "0.0035" - name: textract/analyze_document-Forms units: - name: pages - price: '0.05' + price: "0.05" - name: textract/analyze_document-Tables units: - name: pages - price: '0.015' + price: "0.015" - name: textract/analyze_document-Tables+Forms units: - name: pages - price: '0.065' + price: "0.065" - name: bedrock/us.amazon.nova-lite-v1:0 units: - name: inputTokens - price: '6.0E-8' + price: "6.0E-8" - name: outputTokens - price: '2.4E-7' + price: "2.4E-7" - name: cacheReadInputTokens - price: '1.5E-8' + price: "1.5E-8" - name: cacheWriteInputTokens - price: '6.0E-8' + price: "6.0E-8" - name: bedrock/us.amazon.nova-pro-v1:0 units: - name: inputTokens - price: '8.0E-7' + price: "8.0E-7" - name: outputTokens - price: '3.2E-6' + price: "3.2E-6" - name: cacheReadInputTokens - price: '2.0E-7' + price: "2.0E-7" - name: cacheWriteInputTokens - price: '8.0E-7' + price: "8.0E-7" - name: bedrock/us.amazon.nova-premier-v1:0 units: - name: inputTokens - price: '2.5E-6' + price: "2.5E-6" - name: outputTokens - price: '1.25E-5' + price: "1.25E-5" - name: bedrock/us.anthropic.claude-3-haiku-20240307-v1:0 units: - name: inputTokens - price: '2.5E-7' + price: "2.5E-7" - name: outputTokens - price: '1.25E-6' + price: "1.25E-6" - name: bedrock/us.anthropic.claude-3-5-haiku-20241022-v1:0 units: - name: inputTokens - price: '8.0E-7' + price: "8.0E-7" - name: outputTokens - price: '4.0E-6' + price: "4.0E-6" - name: cacheReadInputTokens - price: '8.0E-8' + price: "8.0E-8" - name: cacheWriteInputTokens - price: '1.0E-6' + price: "1.0E-6" - name: bedrock/us.anthropic.claude-haiku-4-5-20251001-v1:0 units: - name: inputTokens - price: '1.1E-06' + 
price: "1.1E-06" - name: outputTokens - price: '5.5E-06' + price: "5.5E-06" - name: cacheReadInputTokens - price: '1.1E-07' + price: "1.1E-07" - name: cacheWriteInputTokens - price: '1.4E-06' + price: "1.4E-06" - name: bedrock/us.anthropic.claude-3-5-sonnet-20241022-v2:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/us.anthropic.claude-3-7-sonnet-20250219-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/us.anthropic.claude-sonnet-4-20250514-v1:0:1m units: - name: inputTokens - price: '6.0E-6' + price: "6.0E-6" - name: outputTokens - price: '2.25E-5' + price: "2.25E-5" - name: cacheReadInputTokens - price: '6.0E-7' + price: "6.0E-7" - name: cacheWriteInputTokens - price: '7.5E-6' + price: "7.5E-6" - name: bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0 units: - name: inputTokens - price: '3.3E-6' + price: "3.3E-6" - name: outputTokens - price: '1.65E-5' + price: "1.65E-5" - name: cacheReadInputTokens - price: '3.3E-7' + price: "3.3E-7" - name: cacheWriteInputTokens - price: '4.125E-6' + price: "4.125E-6" - name: bedrock/us.anthropic.claude-sonnet-4-5-20250929-v1:0:1m units: - name: inputTokens - price: '6.6E-6' + price: "6.6E-6" - name: outputTokens - price: '2.475E-5' + price: "2.475E-5" - name: 
cacheReadInputTokens - price: '6.6E-7' + price: "6.6E-7" - name: cacheWriteInputTokens - price: '8.25E-6' + price: "8.25E-6" - name: bedrock/us.anthropic.claude-opus-4-20250514-v1:0 units: - name: inputTokens - price: '1.5E-5' + price: "1.5E-5" - name: outputTokens - price: '7.5E-5' + price: "7.5E-5" - name: cacheReadInputTokens - price: '1.5E-6' + price: "1.5E-6" - name: cacheWriteInputTokens - price: '1.875E-5' + price: "1.875E-5" - name: bedrock/us.anthropic.claude-opus-4-1-20250805-v1:0 units: - name: inputTokens - price: '1.5E-5' + price: "1.5E-5" - name: outputTokens - price: '7.5E-5' + price: "7.5E-5" - name: cacheReadInputTokens - price: '1.5E-6' + price: "1.5E-6" - name: cacheWriteInputTokens - price: '1.875E-5' + price: "1.875E-5" # EU model pricing - name: bedrock/eu.amazon.nova-lite-v1:0 units: - name: inputTokens - price: '7.8E-8' + price: "7.8E-8" - name: outputTokens - price: '3.1E-7' + price: "3.1E-7" - name: cacheReadInputTokens - price: '1.9E-8' + price: "1.9E-8" - name: cacheWriteInputTokens - price: '7.8E-8' + price: "7.8E-8" - name: bedrock/eu.amazon.nova-pro-v1:0 units: - name: inputTokens - price: '1.0E-6' + price: "1.0E-6" - name: outputTokens - price: '4.2E-6' + price: "4.2E-6" - name: cacheReadInputTokens - price: '2.6E-7' + price: "2.6E-7" - name: cacheWriteInputTokens - price: '1.0E-6' + price: "1.0E-6" - name: bedrock/eu.anthropic.claude-3-haiku-20240307-v1:0 units: - name: inputTokens - price: '2.5E-7' + price: "2.5E-7" - name: outputTokens - price: '1.25E-6' + price: "1.25E-6" - name: bedrock/eu.anthropic.claude-haiku-4-5-20251001-v1:0 units: - name: inputTokens - price: '1.1E-6' + price: "1.1E-6" - name: outputTokens - price: '5.5E-6' + price: "5.5E-6" - name: cacheReadInputTokens - price: '1.1E-7' + price: "1.1E-7" - name: cacheWriteInputTokens - price: '1.4E-6' + price: "1.4E-6" - name: bedrock/eu.anthropic.claude-3-5-sonnet-20241022-v2:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: 
'1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/eu.anthropic.claude-3-7-sonnet-20250219-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/eu.anthropic.claude-sonnet-4-20250514-v1:0 units: - name: inputTokens - price: '3.0E-6' + price: "3.0E-6" - name: outputTokens - price: '1.5E-5' + price: "1.5E-5" - name: cacheReadInputTokens - price: '3.0E-7' + price: "3.0E-7" - name: cacheWriteInputTokens - price: '3.75E-6' + price: "3.75E-6" - name: bedrock/eu.anthropic.claude-sonnet-4-5-20250929-v1:0 units: - name: inputTokens - price: '3.3E-6' + price: "3.3E-6" - name: outputTokens - price: '1.65E-5' + price: "1.65E-5" - name: cacheReadInputTokens - price: '3.3E-7' + price: "3.3E-7" - name: cacheWriteInputTokens - price: '4.125E-6' + price: "4.125E-6" - name: bedrock/eu.anthropic.claude-sonnet-4-5-20250929-v1:0:1m units: - name: inputTokens - price: '6.6E-6' + price: "6.6E-6" - name: outputTokens - price: '2.475E-5' + price: "2.475E-5" - name: cacheReadInputTokens - price: '6.6E-7' + price: "6.6E-7" - name: cacheWriteInputTokens - price: '8.25E-6' + price: "8.25E-6" # AWS Lambda pricing (US East - N. 
Virginia) - name: lambda/requests units: - name: invocations - price: '2.0E-7' # $0.0000002 per request ($0.20 per 1M requests) - - name: lambda/duration + price: "2.0E-7" # $0.0000002 per request ($0.20 per 1M requests) + - name: lambda/duration units: - name: gb_seconds - price: '1.66667E-5' # $0.0000166667 per GB-second ($16.67 per 1M GB-seconds) + price: "1.66667E-5" # $0.0000166667 per GB-second ($16.67 per 1M GB-seconds) diff --git a/docs/assessment.md b/docs/assessment.md index 5b521138a..02ee73690 100644 --- a/docs/assessment.md +++ b/docs/assessment.md @@ -318,9 +318,9 @@ For basic single-value extractions like dates, amounts, or names: **Configuration:** ```yaml -attributes: - - name: "StatementDate" - attributeType: "simple" +properties: + StatementDate: + type: string description: "The date of the bank statement" ``` @@ -360,14 +360,16 @@ For nested object structures with multiple related fields: **Configuration:** ```yaml -attributes: - - name: "AccountDetails" - attributeType: "group" +properties: + AccountDetails: + type: object description: "Bank account information" - groupAttributes: - - name: "AccountNumber" + properties: + AccountNumber: + type: string description: "The account number" - - name: "RoutingNumber" + RoutingNumber: + type: string description: "The bank routing number" ``` @@ -413,18 +415,22 @@ For arrays of items, such as transactions in a bank statement: **Configuration:** ```yaml -attributes: - - name: "Transactions" - attributeType: "list" +properties: + Transactions: + type: array description: "List of all transactions on the statement" - listItemTemplate: - itemDescription: "Individual transaction entry" - itemAttributes: - - name: "Date" + x-aws-idp-list-item-description: "Individual transaction entry" + items: + type: object + properties: + Date: + type: string description: "Transaction date" - - name: "Description" + Description: + type: string description: "Transaction description" - - name: "Amount" + Amount: + type: 
string description: "Transaction amount" ``` @@ -979,27 +985,34 @@ attributes: Processes complex nested structures as single units: ```yaml # Each group becomes one focused task -attributes: - - name: "AccountDetails" - attributeType: "group" - groupAttributes: - - name: "AccountNumber" - - name: "RoutingNumber" - - name: "AccountType" +properties: + AccountDetails: + type: object + properties: + AccountNumber: + type: string + RoutingNumber: + type: string + AccountType: + type: string ``` #### List Item Tasks Assesses each list item individually for maximum accuracy: ```yaml # 100 transactions = 100 individual assessment tasks -attributes: - - name: "Transactions" - attributeType: "list" - listItemTemplate: - itemAttributes: - - name: "Date" - - name: "Description" - - name: "Amount" +properties: + Transactions: + type: array + items: + type: object + properties: + Date: + type: string + Description: + type: string + Amount: + type: string ``` ### Performance Tuning diff --git a/docs/classification.md b/docs/classification.md index 3ab869ed7..bb112f726 100644 --- a/docs/classification.md +++ b/docs/classification.md @@ -602,13 +602,16 @@ When you want all pages of a document to be classified as the same class, you ca ```yaml classes: - - name: Payslip + - $schema: "https://json-schema.org/draft/2020-12/schema" + $id: Payslip + x-aws-idp-document-type: Payslip + type: object description: "Employee wage statement showing earnings and deductions" - document_name_regex: "(?i).*(payslip|paystub|salary|wage).*" - attributes: - - name: EmployeeName + x-aws-idp-document-name-regex: "(?i).*(payslip|paystub|salary|wage).*" + properties: + EmployeeName: + type: string description: "Name of the employee" - attributeType: simple ``` **Benefits:** @@ -632,24 +635,33 @@ classification: classificationMethod: multimodalPageLevelClassification classes: - - name: Invoice + - $schema: "https://json-schema.org/draft/2020-12/schema" + $id: Invoice + x-aws-idp-document-type: Invoice + 
type: object description: "Business invoice document" - document_page_content_regex: "(?i)(invoice\\s+number|bill\\s+to|amount\\s+due)" - attributes: - - name: InvoiceNumber + x-aws-idp-document-page-content-regex: "(?i)(invoice\\s+number|bill\\s+to|amount\\s+due)" + properties: + InvoiceNumber: + type: string description: "Invoice number" - attributeType: simple - - name: Payslip + - $schema: "https://json-schema.org/draft/2020-12/schema" + $id: Payslip + x-aws-idp-document-type: Payslip + type: object description: "Employee wage statement" - document_page_content_regex: "(?i)(gross\\s+pay|net\\s+pay|employee\\s+id)" - attributes: - - name: EmployeeName + x-aws-idp-document-page-content-regex: "(?i)(gross\\s+pay|net\\s+pay|employee\\s+id)" + properties: + EmployeeName: + type: string description: "Employee name" - attributeType: simple - - name: Other + - $schema: "https://json-schema.org/draft/2020-12/schema" + $id: Other + x-aws-idp-document-type: Other + type: object description: "Documents that don't match specific patterns" # No regex - will always use LLM - attributes: [] + properties: {} ``` **Benefits:** diff --git a/docs/evaluation.md b/docs/evaluation.md index 28011ce8d..01ef96389 100644 --- a/docs/evaluation.md +++ b/docs/evaluation.md @@ -121,19 +121,24 @@ Basic single-value extractions evaluated as individual fields: ```yaml classes: - - name: invoice - attributes: - - name: invoice_number + - $schema: "https://json-schema.org/draft/2020-12/schema" + $id: invoice + x-aws-idp-document-type: invoice + type: object + properties: + invoice_number: + type: string description: The unique identifier for the invoice - attributeType: simple # or omit for default - evaluation_method: EXACT # Use exact string matching - - name: amount_due + x-aws-idp-evaluation-method: EXACT # Use exact string matching + amount_due: + type: string description: The total amount to be paid - evaluation_method: NUMERIC_EXACT # Use numeric comparison - - name: vendor_name + 
x-aws-idp-evaluation-method: NUMERIC_EXACT # Use numeric comparison + vendor_name: + type: string description: Name of the vendor - evaluation_method: FUZZY # Use fuzzy matching - evaluation_threshold: 0.8 # Minimum similarity threshold + x-aws-idp-evaluation-method: FUZZY # Use fuzzy matching + x-aws-idp-confidence-threshold: 0.8 # Minimum similarity threshold ``` ### Group Attributes @@ -142,30 +147,38 @@ Nested object structures where each sub-attribute is evaluated individually: ```yaml classes: - - name: "Bank Statement" - attributes: - - name: "Account Holder Address" + - $schema: "https://json-schema.org/draft/2020-12/schema" + $id: BankStatement + x-aws-idp-document-type: "Bank Statement" + type: object + properties: + Account Holder Address: + type: object description: "Complete address information for the account holder" - attributeType: group - groupAttributes: - - name: "Street Number" + properties: + Street Number: + type: string description: "House or building number" - evaluation_method: FUZZY - evaluation_threshold: 0.9 - - name: "Street Name" + x-aws-idp-evaluation-method: FUZZY + x-aws-idp-confidence-threshold: 0.9 + Street Name: + type: string description: "Name of the street" - evaluation_method: FUZZY - evaluation_threshold: 0.8 - - name: "City" + x-aws-idp-evaluation-method: FUZZY + x-aws-idp-confidence-threshold: 0.8 + City: + type: string description: "City name" - evaluation_method: FUZZY - evaluation_threshold: 0.9 - - name: "State" + x-aws-idp-evaluation-method: FUZZY + x-aws-idp-confidence-threshold: 0.9 + State: + type: string description: "State abbreviation (e.g., CA, NY)" - evaluation_method: EXACT - - name: "ZIP Code" + x-aws-idp-evaluation-method: EXACT + ZIP Code: + type: string description: "5 or 9 digit postal code" - evaluation_method: EXACT + x-aws-idp-evaluation-method: EXACT ``` ### List Attributes @@ -174,19 +187,25 @@ Arrays of items where each item's attributes are evaluated individually across a ```yaml classes: - - 
name: "Bank Statement" - attributes: - - name: "Transactions" + - $schema: "https://json-schema.org/draft/2020-12/schema" + $id: BankStatement + x-aws-idp-document-type: "Bank Statement" + type: object + properties: + Transactions: + type: array description: "List of all transactions in the statement period" - attributeType: list - listItemTemplate: - itemDescription: "Individual transaction record" - itemAttributes: - - name: "Date" + x-aws-idp-list-item-description: "Individual transaction record" + items: + type: object + properties: + Date: + type: string description: "Transaction date (MM/DD/YYYY)" - evaluation_method: FUZZY - evaluation_threshold: 0.9 - - name: "Description" + x-aws-idp-evaluation-method: FUZZY + x-aws-idp-confidence-threshold: 0.9 + Description: + type: string description: "Transaction description or merchant name" evaluation_method: SEMANTIC evaluation_threshold: 0.7 diff --git a/docs/idp-configuration-best-practices.md b/docs/idp-configuration-best-practices.md index d66376b2f..47131aa03 100644 --- a/docs/idp-configuration-best-practices.md +++ b/docs/idp-configuration-best-practices.md @@ -106,19 +106,20 @@ Attributes define the structured data to extract from documents. Comprehensive a **Good Example:** ```yaml -attributes: - - name: YTDNetPay +properties: + YTDNetPay: + type: string description: >- Year-to-date net pay amount representing cumulative take-home earnings after all deductions from the beginning of the year to the current pay period. - evaluation_method: NUMERIC_EXACT - attributeType: simple + x-aws-idp-evaluation-method: NUMERIC_EXACT ``` **Enhanced Example with Location Hints:** ```yaml -attributes: - - name: invoice_number +properties: + invoice_number: + type: string description: >- The unique identifier for this invoice, typically labeled as 'Invoice #', 'Invoice Number', or similar. Usually found in the upper portion of the document, often in a prominent box or header. 
@@ -128,54 +129,64 @@ attributes: **Simple Attributes** - Single value fields: ```yaml -- name: PayDate - description: >- - The actual date when the employee was paid, representing when the compensation was issued - or deposited. - evaluation_method: EXACT - attributeType: simple +properties: + PayDate: + type: string + description: >- + The actual date when the employee was paid, representing when the compensation was issued + or deposited. + x-aws-idp-evaluation-method: EXACT ``` **Group Attributes** - Nested structured data: ```yaml -- name: CompanyAddress - groupAttributes: - - name: State - description: The state or province portion of the company's business address. - evaluation_method: EXACT - - name: ZipCode - description: The postal code portion of the company's business address. - evaluation_method: EXACT - - name: City - description: The city portion of the company's business address. - evaluation_method: EXACT - description: >- - The complete business address of the employing company, including street address, - city, state, and postal code information. - evaluation_method: LLM - attributeType: group +properties: + CompanyAddress: + type: object + description: >- + The complete business address of the employing company, including street address, + city, state, and postal code information. + x-aws-idp-evaluation-method: LLM + properties: + State: + type: string + description: The state or province portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + ZipCode: + type: string + description: The postal code portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + City: + type: string + description: The city portion of the company's business address. + x-aws-idp-evaluation-method: EXACT ``` **List Attributes** - Arrays of structured items: ```yaml -- name: FederalTaxes - listItemTemplate: - itemAttributes: - - name: YTD - description: Year-to-date amount for this federal tax item. 
- evaluation_method: NUMERIC_EXACT - - name: Period - description: Current period amount for this federal tax item. - evaluation_method: NUMERIC_EXACT - - name: ItemDescription - description: Description of the specific federal tax type or category. - evaluation_method: EXACT - itemDescription: Each item represents a specific federal tax withholding category - description: >- - List of federal tax withholdings showing different types of federal taxes deducted, - with both current period and year-to-date amounts. - evaluation_method: LLM - attributeType: list +properties: + FederalTaxes: + type: array + description: >- + List of federal tax withholdings showing different types of federal taxes deducted, + with both current period and year-to-date amounts. + x-aws-idp-evaluation-method: LLM + x-aws-idp-list-item-description: Each item represents a specific federal tax withholding category + items: + type: object + properties: + YTD: + type: string + description: Year-to-date amount for this federal tax item. + x-aws-idp-evaluation-method: NUMERIC_EXACT + Period: + type: string + description: Current period amount for this federal tax item. + x-aws-idp-evaluation-method: NUMERIC_EXACT + ItemDescription: + type: string + description: Description of the specific federal tax type or category. + x-aws-idp-evaluation-method: EXACT ``` #### Evaluation Methods Integration @@ -261,63 +272,63 @@ classes: **Example 1: Employee Address vs Company Address** ```yaml -attributes: - - name: employee_address +properties: + employee_address: + type: string description: >- The residential address of the employee receiving the payslip or benefits. Usually found in the "Employee Information", "Pay To", or recipient section, often indented or in a box. This is NOT the company address, which appears in the header/letterhead area and represents the employer's business location with company logos or "From" labels. 
- attributeType: simple - - name: company_address + company_address: + type: string description: >- The business address of the employing company or organization. Typically found in the header, letterhead, or "From" section with company branding. This is NOT the employee address, which appears in the employee details section and represents the recipient's personal residence, often in a "Pay To" or "Mail To" area. - attributeType: simple ``` **Example 2: Bill To vs Ship To Address** ```yaml -attributes: - - name: bill_to_address +properties: + bill_to_address: + type: string description: >- The billing address where the invoice should be sent for payment processing. Usually labeled "Bill To", "Billing Address", "Invoice To", or "Accounts Payable". This is NOT the shipping address where goods are physically delivered, which is labeled "Ship To", "Delivery Address", or "Service Location". - attributeType: simple - - name: ship_to_address + ship_to_address: + type: string description: >- The delivery address where goods/services are provided or shipped. Usually labeled "Ship To", "Delivery Address", "Service Location", or "Deliver To". This is NOT the billing address where invoices are sent for payment, which is labeled "Bill To", "Billing Address", or "Accounts Payable". - attributeType: simple ``` **Example 3: Patient Name vs Physician Name** ```yaml -attributes: - - name: patient_name +properties: + patient_name: + type: string description: >- The full name of the patient receiving medical care, testing, or treatment. Usually found in patient information sections, labeled "Patient", "Patient Name", or in demographic areas. This is NOT the physician name, which appears in provider sections and may be preceded by "Dr.", "MD", found in signature areas, or labeled "Physician", "Provider". - attributeType: simple - - name: physician_name + physician_name: + type: string description: >- The name of the medical doctor or healthcare provider. 
Usually found in provider sections, preceded by "Dr.", "MD", or in signature areas. May be labeled "Physician", "Provider", "Attending", or "Ordering Physician". This is NOT the patient name, which appears in patient demographic sections and is labeled "Patient", "Patient Name", or in the main subject area of the document. - attributeType: simple ``` #### Best Practices for Negative Prompting @@ -1636,32 +1647,43 @@ classes: **Simple Attributes:** ```yaml -- name: date_field - description: "Specific date with clear location hint and format requirement" - evaluation_method: EXACT - attributeType: simple +properties: + date_field: + type: string + description: "Specific date with clear location hint and format requirement" + x-aws-idp-evaluation-method: EXACT ``` **Complex Nested Structures:** ```yaml -- name: address_group - groupAttributes: - - name: street - - name: city - - name: state - - name: zip_code - attributeType: group +properties: + address_group: + type: object + properties: + street: + type: string + city: + type: string + state: + type: string + zip_code: + type: string ``` **Dynamic Lists:** ```yaml -- name: transaction_list - listItemTemplate: - itemAttributes: - - name: date - - name: amount - - name: description - attributeType: list +properties: + transaction_list: + type: array + items: + type: object + properties: + date: + type: string + amount: + type: string + description: + type: string ``` ### Prompt Templates diff --git a/lib/idp_common_pkg/idp_common/bedrock/client.py b/lib/idp_common_pkg/idp_common/bedrock/client.py index f7191ed33..4d910d787 100644 --- a/lib/idp_common_pkg/idp_common/bedrock/client.py +++ b/lib/idp_common_pkg/idp_common/bedrock/client.py @@ -16,23 +16,47 @@ import copy import random import socket -from typing import Dict, Any, List, Optional, Union, Tuple +from typing import Dict, Any, List, Optional, Union, Tuple, Type from botocore.config import Config -from botocore.exceptions import ClientError, ReadTimeoutError, 
ConnectTimeoutError, EndpointConnectionError +from botocore.exceptions import ( + ClientError, + ReadTimeoutError, + ConnectTimeoutError, + EndpointConnectionError, +) from urllib3.exceptions import ReadTimeoutError as Urllib3ReadTimeoutError + + +# Dummy exception classes for requests timeouts if requests is not available +class _RequestsReadTimeout(Exception): + """Fallback exception class when requests library is not available.""" + + pass + + +class _RequestsConnectTimeout(Exception): + """Fallback exception class when requests library is not available.""" + + pass + + try: - from requests.exceptions import ReadTimeout as RequestsReadTimeout, ConnectTimeout as RequestsConnectTimeout + from requests.exceptions import ( + ReadTimeout as RequestsReadTimeout, + ConnectTimeout as RequestsConnectTimeout, + ) except ImportError: - # Fallback if requests is not available - RequestsReadTimeout = Exception - RequestsConnectTimeout = Exception + # Fallback if requests is not available - use dummy exception classes + RequestsReadTimeout = _RequestsReadTimeout # type: ignore[misc,assignment] + RequestsConnectTimeout = _RequestsConnectTimeout # type: ignore[misc,assignment] + logger = logging.getLogger(__name__) # Default retry settings DEFAULT_MAX_RETRIES = 7 DEFAULT_INITIAL_BACKOFF = 2 # seconds -DEFAULT_MAX_BACKOFF = 300 # 5 minutes +DEFAULT_MAX_BACKOFF = 300 # 5 minutes # Models that support cachePoint functionality @@ -47,23 +71,24 @@ "us.anthropic.claude-sonnet-4-5-20250929-v1:0", "us.anthropic.claude-sonnet-4-5-20250929-v1:0:1m", "us.amazon.nova-lite-v1:0", - "us.amazon.nova-pro-v1:0" + "us.amazon.nova-pro-v1:0", ] + class BedrockClient: """Client for interacting with Amazon Bedrock models.""" - + def __init__( - self, + self, region: Optional[str] = None, max_retries: int = DEFAULT_MAX_RETRIES, initial_backoff: float = DEFAULT_INITIAL_BACKOFF, max_backoff: float = DEFAULT_MAX_BACKOFF, - metrics_enabled: bool = True + metrics_enabled: bool = True, ): """ Initialize a 
Bedrock client. - + Args: region: AWS region (defaults to AWS_REGION env var or us-west-2) max_retries: Maximum number of retry attempts @@ -71,24 +96,26 @@ def __init__( max_backoff: Maximum backoff time in seconds metrics_enabled: Whether to publish metrics """ - self.region = region or os.environ.get('AWS_REGION') + self.region = region or os.environ.get("AWS_REGION") self.max_retries = max_retries self.initial_backoff = initial_backoff self.max_backoff = max_backoff self.metrics_enabled = metrics_enabled self._client = None - + @property def client(self): """Lazy-loaded Bedrock client.""" config = Config( connect_timeout=10, - read_timeout=300 # allow plenty of time for large extraction or assessment inferences - ) + read_timeout=300, # allow plenty of time for large extraction or assessment inferences + ) if self._client is None: - self._client = boto3.client('bedrock-runtime', region_name=self.region, config=config) + self._client = boto3.client( + "bedrock-runtime", region_name=self.region, config=config + ) return self._client - + def __call__( self, model_id: str, @@ -99,13 +126,13 @@ def __call__( top_p: Optional[Union[float, str]] = None, max_tokens: Optional[Union[int, str]] = None, max_retries: Optional[int] = None, - context: str = "Unspecified" + context: str = "Unspecified", ) -> Dict[str, Any]: """ Make the instance callable with the same signature as the original function. - + This allows instances to be used as drop-in replacements for the function. 
- + Args: model_id: The Bedrock model ID (e.g., 'anthropic.claude-3-sonnet-20240229-v1:0') system_prompt: The system prompt as string or list of content objects @@ -115,13 +142,15 @@ def __call__( top_p: Optional top_p parameter (float or string) max_tokens: Optional max_tokens parameter (int or string) max_retries: Optional override for the instance's max_retries setting - + Returns: Bedrock response object with metering information """ # Use instance max_retries if not overridden - effective_max_retries = max_retries if max_retries is not None else self.max_retries - + effective_max_retries = ( + max_retries if max_retries is not None else self.max_retries + ) + return self.invoke_model( model_id=model_id, system_prompt=system_prompt, @@ -131,65 +160,87 @@ def __call__( top_p=top_p, max_tokens=max_tokens, max_retries=effective_max_retries, - context=context + context=context, ) - - def _preprocess_content_for_cachepoint(self, content: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + + def _preprocess_content_for_cachepoint( + self, content: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: """ Process content list to handle <> tags in text elements. - + For text elements containing <> tags, this function will split the text and insert cachePoint elements at the tag positions. - + Args: content: The content list for the user message (can include text and images) - + Returns: Processed content list with cachePoint elements inserted """ if not content: return content - + processed_content = [] cachepoint_count = 0 - + for item in content: # If it's a text element, check for <> tags - if "text" in item and isinstance(item["text"], str) and "<>" in item["text"]: + if ( + "text" in item + and isinstance(item["text"], str) + and "<>" in item["text"] + ): # Log that we found a cachepoint tag - logger.debug(f"Found <> tags in text content: {item['text'][:50]}...") - + logger.debug( + f"Found <> tags in text content: {item['text'][:50]}..." 
+ ) + # Split the text by the tag text_parts = item["text"].split("<>") - logger.debug(f"Split text into {len(text_parts)} parts at cachepoint tags") - + logger.debug( + f"Split text into {len(text_parts)} parts at cachepoint tags" + ) + # Add each text part interspersed with cachePoint elements for i, text_part in enumerate(text_parts): # Only add non-empty text parts if text_part: # Count words in this part word_count = len(text_part.split()) - logger.debug(f"Text part {i+1}: {word_count} words") + logger.debug(f"Text part {i + 1}: {word_count} words") processed_content.append({"text": text_part}) else: - logger.debug(f"Text part {i+1}: Empty, skipping") - + logger.debug(f"Text part {i + 1}: Empty, skipping") + # Add cachePoint after each text part except the last one if i < len(text_parts) - 1: cachepoint_count += 1 - logger.debug(f"Inserting cachePoint #{cachepoint_count} after text part {i+1}") + logger.debug( + f"Inserting cachePoint #{cachepoint_count} after text part {i + 1}" + ) processed_content.append({"cachePoint": {"type": "default"}}) else: # If not a text element or no tags, add it as is - content_type = "text" if "text" in item else "image" if "image" in item else "other" - logger.debug(f"No cachepoint tags in {content_type} content, passing through unchanged") + content_type = ( + "text" + if "text" in item + else "image" + if "image" in item + else "other" + ) + logger.debug( + f"No cachepoint tags in {content_type} content, passing through unchanged" + ) processed_content.append(item) - + if cachepoint_count > 0: - logger.info(f"Processed content with {cachepoint_count} cachepoint insertions") - + logger.info( + f"Processed content with {cachepoint_count} cachepoint insertions" + ) + return processed_content - + def invoke_model( self, model_id: str, @@ -200,11 +251,11 @@ def invoke_model( top_p: Optional[Union[float, str]] = 0.1, max_tokens: Optional[Union[int, str]] = None, max_retries: Optional[int] = None, - context: str = "Unspecified" + 
context: str = "Unspecified", ) -> Dict[str, Any]: """ Invoke a Bedrock model with retry logic. - + Args: model_id: The Bedrock model ID (e.g., 'anthropic.claude-3-sonnet-20240229-v1:0') system_prompt: The system prompt as string or list of content objects @@ -214,64 +265,78 @@ def invoke_model( top_p: Optional top_p parameter (float or string) max_tokens: Optional max_tokens parameter (int or string) max_retries: Optional override for the instance's max_retries setting - + Returns: Bedrock response object with metering information """ # Track total requests - self._put_metric('BedrockRequestsTotal', 1) - + self._put_metric("BedrockRequestsTotal", 1) + # Use instance max_retries if not overridden - effective_max_retries = max_retries if max_retries is not None else self.max_retries - + effective_max_retries = ( + max_retries if max_retries is not None else self.max_retries + ) + # Format system prompt if needed if isinstance(system_prompt, str): formatted_system_prompt = [{"text": system_prompt}] else: formatted_system_prompt = system_prompt - + # Check for cachePoint tags in content - has_cachepoint_tags = any("text" in item and isinstance(item["text"], str) and "<>" in item["text"] for item in content) - + has_cachepoint_tags = any( + "text" in item + and isinstance(item["text"], str) + and "<>" in item["text"] + for item in content + ) + if has_cachepoint_tags: if model_id in CACHEPOINT_SUPPORTED_MODELS: # Process content for cachePoint tags with supported model processed_content = self._preprocess_content_for_cachepoint(content) - logger.info(f"Applied cachePoint processing for supported model: {model_id}") + logger.info( + f"Applied cachePoint processing for supported model: {model_id}" + ) else: # For unsupported models, just remove the <> tags but keep content intact processed_content = [] for item in content: - if "text" in item and isinstance(item["text"], str) and "<>" in item["text"]: + if ( + "text" in item + and isinstance(item["text"], str) + and "<>" 
in item["text"] + ): # Remove the cachepoint tags but keep the text clean_text = item["text"].replace("<>", "") processed_content.append({"text": clean_text}) - logger.warning(f"Removed <> tags for unsupported model: {model_id}. CachePoint is only supported for: {', '.join(CACHEPOINT_SUPPORTED_MODELS)}") + logger.warning( + f"Removed <> tags for unsupported model: {model_id}. CachePoint is only supported for: {', '.join(CACHEPOINT_SUPPORTED_MODELS)}" + ) else: # Pass through unchanged processed_content.append(item) else: # No cachepoint tags, use content as is processed_content = content - + # Build message - message = { - "role": "user", - "content": processed_content - } + message = {"role": "user", "content": processed_content} messages = [message] - + # Convert temperature to float if it's a string if isinstance(temperature, str): try: temperature = float(temperature) except ValueError: - logger.warning(f"Failed to convert temperature value '{temperature}' to float. Using default 0.0") + logger.warning( + f"Failed to convert temperature value '{temperature}' to float. Using default 0.0" + ) temperature = 0.0 - + # Initialize inference config with temperature inference_config = {"temperature": temperature} - + # Handle top_p parameter - only use if temperature is 0 or not specified # Some models don't allow both temperature and top_p to be specified if top_p is not None and temperature == 0.0: @@ -280,14 +345,16 @@ def invoke_model( try: top_p = float(top_p) except ValueError: - logger.warning(f"Failed to convert top_p value '{top_p}' to float. Not using top_p.") + logger.warning( + f"Failed to convert top_p value '{top_p}' to float. Not using top_p." 
+ ) top_p = None - + if top_p is not None: inference_config["topP"] = top_p # Remove temperature when using top_p to avoid conflicts del inference_config["temperature"] - + # Handle max_tokens parameter if max_tokens is not None: # Convert max_tokens to int if it's a string @@ -295,16 +362,18 @@ def invoke_model( try: max_tokens = int(max_tokens) except ValueError: - logger.warning(f"Failed to convert max_tokens value '{max_tokens}' to int. Not using max_tokens.") + logger.warning( + f"Failed to convert max_tokens value '{max_tokens}' to int. Not using max_tokens." + ) max_tokens = None - + # Add to inferenceConfig as maxTokens for Nova models if max_tokens is not None and "amazon" in model_id.lower(): inference_config["maxTokens"] = max_tokens - + # Add additional model fields if needed - additional_model_fields = {} - + additional_model_fields = {} + # Handle top_k parameter if top_k is not None: # Convert top_k to float if it's a string @@ -312,18 +381,20 @@ def invoke_model( try: top_k = float(top_k) except ValueError: - logger.warning(f"Failed to convert top_k value '{top_k}' to float. Not using top_k.") + logger.warning( + f"Failed to convert top_k value '{top_k}' to float. Not using top_k." 
+ ) top_k = None - + # Handle model-specific parameters if "anthropic" in model_id.lower(): # Add parameters to additionalModelRequestFields for Claude (snake_case) if top_k is not None: additional_model_fields["top_k"] = top_k - + if max_tokens is not None: additional_model_fields["max_tokens"] = max_tokens - + # Handle Nova-specific parameters elif "amazon" in model_id.lower(): # For Nova models, topK should be in additionalModelRequestFields.inferenceConfig @@ -336,35 +407,35 @@ def invoke_model( # Add 1M context headers if needed use_model_id = model_id - if model_id and model_id.endswith(':1m'): + if model_id and model_id.endswith(":1m"): use_model_id = model_id[:-3] # Remove ':1m' if additional_model_fields is None: additional_model_fields = {} additional_model_fields["anthropic_beta"] = ["context-1m-2025-08-07"] - + # If no additional model fields were added, set to None if not additional_model_fields: additional_model_fields = None - + # Get guardrail configuration if available guardrail_config = self.get_guardrail_config() - + # Build converse parameters - converse_params = { + converse_params: Dict[str, Any] = { "modelId": use_model_id, "messages": messages, "system": formatted_system_prompt, "inferenceConfig": inference_config, - "additionalModelRequestFields": additional_model_fields + "additionalModelRequestFields": additional_model_fields, } - + # Add guardrail config if available if guardrail_config: converse_params["guardrailConfig"] = guardrail_config - + # Start timing the entire request request_start_time = time.time() - + # Call the recursive retry function result = self._invoke_with_retry( model_id=model_id, @@ -372,9 +443,9 @@ def invoke_model( retry_count=0, max_retries=effective_max_retries, request_start_time=request_start_time, - context=context + context=context, ) - + return result def _invoke_with_retry( @@ -384,22 +455,22 @@ def _invoke_with_retry( retry_count: int, max_retries: int, request_start_time: float, - last_exception: 
Exception = None, - context: str = "Unspecified" + last_exception: Optional[Exception] = None, + context: str = "Unspecified", ) -> Dict[str, Any]: """ Recursive helper method to handle retries for Bedrock invocation. - + Args: converse_params: Parameters for the Bedrock converse API call retry_count: Current retry attempt (0-based) max_retries: Maximum number of retry attempts request_start_time: Time when the original request started last_exception: The last exception encountered (for final error reporting) - + Returns: Bedrock response object with metering information - + Raises: Exception: The last exception encountered if max retries are exceeded """ @@ -407,105 +478,117 @@ def _invoke_with_retry( # Create a copy of the messages to sanitize for logging sanitized_params = copy.deepcopy(converse_params) if "messages" in sanitized_params: - sanitized_params["messages"] = self._sanitize_messages_for_logging(sanitized_params["messages"]) - + sanitized_params["messages"] = self._sanitize_messages_for_logging( + sanitized_params["messages"] + ) + # Log detailed request parameters logger.info(f"Bedrock request attempt {retry_count + 1}/{max_retries}:") logger.info(f" - model: {converse_params['modelId']}") logger.info(f" - inferenceConfig: {converse_params['inferenceConfig']}") logger.info(f" - system: {converse_params['system']}") logger.info(f" - messages: {sanitized_params['messages']}") - logger.info(f" - additionalModelRequestFields: {converse_params['additionalModelRequestFields']}") - + logger.info( + f" - additionalModelRequestFields: {converse_params['additionalModelRequestFields']}" + ) + # Log guardrail usage if configured if "guardrailConfig" in converse_params: - logger.debug(f" - guardrailConfig: {converse_params['guardrailConfig']}") - + logger.debug( + f" - guardrailConfig: {converse_params['guardrailConfig']}" + ) + # Start timing this attempt attempt_start_time = time.time() # Make the API call response = self.client.converse(**converse_params) - + # 
Calculate duration duration = time.time() - attempt_start_time - + # Log response details, but sanitize large content sanitized_response = self._sanitize_response_for_logging(response) - logger.info(f"Bedrock request successful after {retry_count + 1} attempts. Duration: {duration:.2f}s") + logger.info( + f"Bedrock request successful after {retry_count + 1} attempts. Duration: {duration:.2f}s" + ) logger.debug(f"Response: {sanitized_response}") logger.info(f"Token Usage: {response.get('usage')}") # Track successful requests and latency - self._put_metric('BedrockRequestsSucceeded', 1) - self._put_metric('BedrockRequestLatency', duration * 1000, 'Milliseconds') + self._put_metric("BedrockRequestsSucceeded", 1) + self._put_metric("BedrockRequestLatency", duration * 1000, "Milliseconds") if retry_count > 0: - self._put_metric('BedrockRetrySuccess', 1) - + self._put_metric("BedrockRetrySuccess", 1) + # Track token usage - if 'usage' in response: - inputTokens = response['usage'].get('inputTokens', 0) - outputTokens = response['usage'].get('outputTokens', 0) - total_tokens = response['usage'].get('totalTokens', 0) - cacheReadInputTokens = response['usage'].get('cacheReadInputTokens', 0) - cacheWriteInputTokens = response['usage'].get('cacheWriteInputTokens', 0) - self._put_metric('InputTokens', inputTokens) - self._put_metric('OutputTokens', outputTokens) - self._put_metric('TotalTokens', total_tokens) - self._put_metric('CacheReadInputTokens', cacheReadInputTokens) - self._put_metric('CacheWriteInputTokens', cacheWriteInputTokens) - + if "usage" in response: + inputTokens = response["usage"].get("inputTokens", 0) + outputTokens = response["usage"].get("outputTokens", 0) + total_tokens = response["usage"].get("totalTokens", 0) + cacheReadInputTokens = response["usage"].get("cacheReadInputTokens", 0) + cacheWriteInputTokens = response["usage"].get( + "cacheWriteInputTokens", 0 + ) + self._put_metric("InputTokens", inputTokens) + self._put_metric("OutputTokens", 
outputTokens) + self._put_metric("TotalTokens", total_tokens) + self._put_metric("CacheReadInputTokens", cacheReadInputTokens) + self._put_metric("CacheWriteInputTokens", cacheWriteInputTokens) + # Calculate total duration total_duration = time.time() - request_start_time - self._put_metric('BedrockTotalLatency', total_duration * 1000, 'Milliseconds') - + self._put_metric( + "BedrockTotalLatency", total_duration * 1000, "Milliseconds" + ) + # Create metering data - usage = response.get('usage', {}) + usage = response.get("usage", {}) response_with_metering = { "response": response, - "metering": { - f"{context}/bedrock/{model_id}": { - **usage - } - } + "metering": {f"{context}/bedrock/{model_id}": {**usage}}, } - + return response_with_metering - + except ClientError as e: # Handle boto3/botocore client errors (have response structure) - error_code = e.response['Error']['Code'] - error_message = e.response['Error']['Message'] - + error_code = e.response["Error"]["Code"] + error_message = e.response["Error"]["Message"] + retryable_errors = [ - 'ThrottlingException', - 'ServiceQuotaExceededException', - 'RequestLimitExceeded', - 'TooManyRequestsException', - 'ServiceUnavailableException', - 'ModelErrorException', - 'RequestTimeout', - 'RequestTimeoutException' + "ThrottlingException", + "ServiceQuotaExceededException", + "RequestLimitExceeded", + "TooManyRequestsException", + "ServiceUnavailableException", + "ModelErrorException", + "RequestTimeout", + "RequestTimeoutException", ] - + if error_code in retryable_errors: - self._put_metric('BedrockThrottles', 1) - + self._put_metric("BedrockThrottles", 1) + # Check if we've reached max retries if retry_count >= max_retries: - logger.error(f"Max retries ({max_retries}) exceeded. Last error: {error_message}") - self._put_metric('BedrockRequestsFailed', 1) - self._put_metric('BedrockMaxRetriesExceeded', 1) + logger.error( + f"Max retries ({max_retries}) exceeded. 
Last error: {error_message}" + ) + self._put_metric("BedrockRequestsFailed", 1) + self._put_metric("BedrockMaxRetriesExceeded", 1) raise - + # Calculate backoff time backoff = self._calculate_backoff(retry_count) - logger.warning(f"Bedrock throttling occurred (attempt {retry_count + 1}/{max_retries}). " - f"Error: {error_message}. " - f"Backing off for {backoff:.2f}s") - + logger.warning( + f"Bedrock throttling occurred (attempt {retry_count + 1}/{max_retries}). " + f"Error: {error_message}. " + f"Backing off for {backoff:.2f}s" + ) + # Sleep for backoff period time.sleep(backoff) - + # Recursive call with incremented retry count return self._invoke_with_retry( model_id=model_id, @@ -514,37 +597,49 @@ def _invoke_with_retry( max_retries=max_retries, request_start_time=request_start_time, last_exception=e, - context=context + context=context, ) else: - logger.error(f"Non-retryable Bedrock error: {error_code} - {error_message}") - self._put_metric('BedrockRequestsFailed', 1) - self._put_metric('BedrockNonRetryableErrors', 1) + logger.error( + f"Non-retryable Bedrock error: {error_code} - {error_message}" + ) + self._put_metric("BedrockRequestsFailed", 1) + self._put_metric("BedrockNonRetryableErrors", 1) raise - - except (ReadTimeoutError, ConnectTimeoutError, EndpointConnectionError, - Urllib3ReadTimeoutError, RequestsReadTimeout, RequestsConnectTimeout) as e: + + except ( + ReadTimeoutError, + ConnectTimeoutError, + EndpointConnectionError, + Urllib3ReadTimeoutError, + RequestsReadTimeout, + RequestsConnectTimeout, + ) as e: # Handle timeout and connection errors (these are retryable) error_message = str(e) - - self._put_metric('BedrockTimeouts', 1) - + + self._put_metric("BedrockTimeouts", 1) + # Check if we've reached max retries if retry_count >= max_retries: - logger.error(f"Max retries ({max_retries}) exceeded. 
Last timeout error: {error_message}") - self._put_metric('BedrockRequestsFailed', 1) - self._put_metric('BedrockMaxRetriesExceeded', 1) + logger.error( + f"Max retries ({max_retries}) exceeded. Last timeout error: {error_message}" + ) + self._put_metric("BedrockRequestsFailed", 1) + self._put_metric("BedrockMaxRetriesExceeded", 1) raise - + # Calculate backoff time backoff = self._calculate_backoff(retry_count) - logger.warning(f"Bedrock timeout occurred (attempt {retry_count + 1}/{max_retries}). " - f"Error: {error_message}. " - f"Backing off for {backoff:.2f}s") - + logger.warning( + f"Bedrock timeout occurred (attempt {retry_count + 1}/{max_retries}). " + f"Error: {error_message}. " + f"Backing off for {backoff:.2f}s" + ) + # Sleep for backoff period time.sleep(backoff) - + # Recursive call with incremented retry count return self._invoke_with_retry( model_id=model_id, @@ -553,93 +648,94 @@ def _invoke_with_retry( max_retries=max_retries, request_start_time=request_start_time, last_exception=e, - context=context + context=context, ) - + except Exception as e: # Handle unexpected errors (not retryable) error_message = str(e) logger.error(f"Unexpected Bedrock error: {error_message}", exc_info=True) - self._put_metric('BedrockRequestsFailed', 1) - self._put_metric('BedrockUnexpectedErrors', 1) + self._put_metric("BedrockRequestsFailed", 1) + self._put_metric("BedrockUnexpectedErrors", 1) raise - def get_guardrail_config(self) -> Optional[Dict[str, str]]: """ Get guardrail configuration from environment if available. 
- + Returns: Optional guardrail configuration dict with id and version """ guardrail_env = os.environ.get("GUARDRAIL_ID_AND_VERSION", "") if not guardrail_env: return None - + try: guardrail_id, guardrail_version = guardrail_env.split(":") if guardrail_id and guardrail_version: - logger.debug(f"Using Bedrock Guardrail ID: {guardrail_id}, Version: {guardrail_version}") + logger.debug( + f"Using Bedrock Guardrail ID: {guardrail_id}, Version: {guardrail_version}" + ) return { "guardrailIdentifier": guardrail_id, "guardrailVersion": guardrail_version, - "trace": "enabled" # Enable tracing for guardrail violations + "trace": "enabled", # Enable tracing for guardrail violations } except ValueError: - logger.warning(f"Invalid GUARDRAIL_ID_AND_VERSION format: {guardrail_env}. Expected format: 'id:version'") - + logger.warning( + f"Invalid GUARDRAIL_ID_AND_VERSION format: {guardrail_env}. Expected format: 'id:version'" + ) + return None - + def generate_embedding( - self, - text: str, + self, + text: str, model_id: str = "amazon.titan-embed-text-v1", - max_retries: Optional[int] = None + max_retries: Optional[int] = None, ) -> List[float]: """ Generate an embedding vector for the given text using Amazon Bedrock. 
- + Args: text: The text to generate embeddings for model_id: The embedding model ID to use (default: amazon.titan-embed-text-v1) max_retries: Optional override for the instance's max_retries setting - + Returns: List of floats representing the embedding vector """ if not text or not isinstance(text, str): # Return an empty vector for empty input return [] - + # Use instance max_retries if not overridden - effective_max_retries = max_retries if max_retries is not None else self.max_retries - + effective_max_retries = ( + max_retries if max_retries is not None else self.max_retries + ) + # Track total embedding requests - self._put_metric('BedrockEmbeddingRequestsTotal', 1) - + self._put_metric("BedrockEmbeddingRequestsTotal", 1) + # Normalize whitespace and prepare the input text normalized_text = " ".join(text.split()) - + # Prepare the request body based on the model if "amazon.titan-embed" in model_id: - request_body = json.dumps({ - "inputText": normalized_text - }) + request_body = json.dumps({"inputText": normalized_text}) else: # Default format for other models - request_body = json.dumps({ - "text": normalized_text - }) - + request_body = json.dumps({"text": normalized_text}) + # Call the recursive embedding function return self._generate_embedding_with_retry( model_id=model_id, request_body=request_body, normalized_text=normalized_text, retry_count=0, - max_retries=effective_max_retries + max_retries=effective_max_retries, ) - + def _generate_embedding_with_retry( self, model_id: str, @@ -647,11 +743,11 @@ def _generate_embedding_with_retry( normalized_text: str, retry_count: int, max_retries: int, - last_exception: Exception = None + last_exception: Optional[Exception] = None, ) -> List[float]: """ Recursive helper method to handle retries for embedding generation. 
- + Args: model_id: The embedding model ID request_body: JSON request body for the API call @@ -659,79 +755,87 @@ def _generate_embedding_with_retry( retry_count: Current retry attempt (0-based) max_retries: Maximum number of retry attempts last_exception: The last exception encountered (for final error reporting) - + Returns: List of floats representing the embedding vector - + Raises: Exception: The last exception encountered if max retries are exceeded """ try: - logger.info(f"Bedrock embedding request attempt {retry_count + 1}/{max_retries}:") + logger.info( + f"Bedrock embedding request attempt {retry_count + 1}/{max_retries}:" + ) logger.debug(f" - model: {model_id}") logger.debug(f" - input text length: {len(normalized_text)} characters") - + attempt_start_time = time.time() response = self.client.invoke_model( modelId=model_id, contentType="application/json", accept="application/json", - body=request_body + body=request_body, ) duration = time.time() - attempt_start_time - + # Extract the embedding vector from response response_body = json.loads(response["body"].read()) - + # Handle different response formats based on the model if "amazon.titan-embed" in model_id: embedding = response_body.get("embedding", []) else: # Default extraction format embedding = response_body.get("embedding", []) - + # Track successful requests and latency - self._put_metric('BedrockEmbeddingRequestsSucceeded', 1) - self._put_metric('BedrockEmbeddingRequestLatency', duration * 1000, 'Milliseconds') - + self._put_metric("BedrockEmbeddingRequestsSucceeded", 1) + self._put_metric( + "BedrockEmbeddingRequestLatency", duration * 1000, "Milliseconds" + ) + logger.debug(f"Generated embedding with {len(embedding)} dimensions") return embedding - + except ClientError as e: - error_code = e.response['Error']['Code'] - error_message = e.response['Error']['Message'] - + error_code = e.response["Error"]["Code"] + error_message = e.response["Error"]["Message"] + retryable_errors = [ - 
'ThrottlingException', - 'ServiceQuotaExceededException', - 'RequestLimitExceeded', - 'TooManyRequestsException', - 'ServiceUnavailableException', - 'RequestTimeout', - 'ReadTimeout', - 'TimeoutError', - 'RequestTimeoutException' + "ThrottlingException", + "ServiceQuotaExceededException", + "RequestLimitExceeded", + "TooManyRequestsException", + "ServiceUnavailableException", + "RequestTimeout", + "ReadTimeout", + "TimeoutError", + "RequestTimeoutException", ] - + if error_code in retryable_errors: - self._put_metric('BedrockEmbeddingThrottles', 1) - + self._put_metric("BedrockEmbeddingThrottles", 1) + # Check if we've reached max retries if retry_count >= max_retries: - logger.error(f"Max retries ({max_retries}) exceeded for embedding. Last error: {error_message}") - self._put_metric('BedrockEmbeddingRequestsFailed', 1) - self._put_metric('BedrockEmbeddingMaxRetriesExceeded', 1) + logger.error( + f"Max retries ({max_retries}) exceeded for embedding. Last error: {error_message}" + ) + self._put_metric("BedrockEmbeddingRequestsFailed", 1) + self._put_metric("BedrockEmbeddingMaxRetriesExceeded", 1) raise - + # Calculate backoff time backoff = self._calculate_backoff(retry_count) - logger.warning(f"Bedrock throttling occurred (attempt {retry_count + 1}/{max_retries}). " - f"Error: {error_message}. " - f"Backing off for {backoff:.2f}s") - + logger.warning( + f"Bedrock throttling occurred (attempt {retry_count + 1}/{max_retries}). " + f"Error: {error_message}. 
" + f"Backing off for {backoff:.2f}s" + ) + # Sleep for backoff period time.sleep(backoff) - + # Recursive call with incremented retry count return self._generate_embedding_with_retry( model_id=model_id, @@ -739,94 +843,101 @@ def _generate_embedding_with_retry( normalized_text=normalized_text, retry_count=retry_count + 1, max_retries=max_retries, - last_exception=e + last_exception=e, ) else: - logger.error(f"Non-retryable Bedrock error for embedding: {error_code} - {error_message}") - self._put_metric('BedrockEmbeddingRequestsFailed', 1) - self._put_metric('BedrockEmbeddingNonRetryableErrors', 1) + logger.error( + f"Non-retryable Bedrock error for embedding: {error_code} - {error_message}" + ) + self._put_metric("BedrockEmbeddingRequestsFailed", 1) + self._put_metric("BedrockEmbeddingNonRetryableErrors", 1) raise - + except Exception as e: - logger.error(f"Unexpected error generating embedding: {str(e)}", exc_info=True) - self._put_metric('BedrockEmbeddingRequestsFailed', 1) - self._put_metric('BedrockEmbeddingUnexpectedErrors', 1) + logger.error( + f"Unexpected error generating embedding: {str(e)}", exc_info=True + ) + self._put_metric("BedrockEmbeddingRequestsFailed", 1) + self._put_metric("BedrockEmbeddingUnexpectedErrors", 1) raise - + def extract_text_from_response(self, response: Dict[str, Any]) -> str: """ Extract text from a Bedrock response. - + Args: response: Bedrock response object - + Returns: Extracted text content """ response_obj = response.get("response", response) - return response_obj['output']['message']['content'][0].get("text", "") - + return response_obj["output"]["message"]["content"][0].get("text", "") + def format_prompt( - self, - prompt_template: str, - substitutions: Dict[str, str], - required_placeholders: List[str] = None + self, + prompt_template: str, + substitutions: dict[str, str], + required_placeholders: list[str] | None = None, ) -> str: """ Prepare prompt from template by replacing placeholders with values. 
- + Args: prompt_template: The prompt template with placeholders in {PLACEHOLDER} format substitutions: Dictionary of placeholder values required_placeholders: List of placeholder names that must be present in the template - + Returns: String with placeholders replaced by values - + Raises: ValueError: If a required placeholder is missing from the template """ # Validate required placeholders if specified if required_placeholders: - missing_placeholders = [p for p in required_placeholders if f"{{{p}}}" not in prompt_template] + missing_placeholders = [ + p for p in required_placeholders if f"{{{p}}}" not in prompt_template + ] if missing_placeholders: - raise ValueError(f"Prompt template must contain the following placeholders: {', '.join([f'{{{p}}}' for p in missing_placeholders])}") - + raise ValueError( + f"Prompt template must contain the following placeholders: {', '.join([f'{{{p}}}' for p in missing_placeholders])}" + ) + # Check if template uses {PLACEHOLDER} format and convert to %(PLACEHOLDER)s for secure replacement if any(f"{{{key}}}" in prompt_template for key in substitutions): for key in substitutions: placeholder = f"{{{key}}}" if placeholder in prompt_template: prompt_template = prompt_template.replace(placeholder, f"%({key})s") - + # Apply substitutions using % operator which is safer than .format() return prompt_template % substitutions - + def _calculate_backoff(self, retry_count: int) -> float: """ Calculate exponential backoff time with jitter. 
- + Args: retry_count: Current retry attempt (0-based) - + Returns: Backoff time in seconds """ # Exponential backoff with base of 2 - backoff_seconds = min( - self.max_backoff, - self.initial_backoff * (2 ** retry_count) - ) - + backoff_seconds = min(self.max_backoff, self.initial_backoff * (2**retry_count)) + # Add jitter (random value between 0 and 1 second) jitter = random.random() - + return backoff_seconds + jitter - - def _put_metric(self, metric_name: str, value: Union[int, float], unit: str = 'Count'): + + def _put_metric( + self, metric_name: str, value: Union[int, float], unit: str = "Count" + ): """ Publish a metric if metrics are enabled. - + Args: metric_name: Name of the metric value: Metric value @@ -835,75 +946,87 @@ def _put_metric(self, metric_name: str, value: Union[int, float], unit: str = 'C if self.metrics_enabled: try: from ..metrics import put_metric + put_metric(metric_name, value, unit) except Exception as e: logger.warning(f"Failed to publish metric {metric_name}: {str(e)}") - - def _sanitize_messages_for_logging(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + + def _sanitize_messages_for_logging( + self, messages: List[Dict[str, Any]] + ) -> List[Dict[str, Any]]: """ Create a copy of messages with image content replaced for logging. 
- + Args: messages: List of message objects for Bedrock API - + Returns: Sanitized message objects suitable for logging """ sanitized = copy.deepcopy(messages) - + for message in sanitized: - if 'content' in message and isinstance(message['content'], list): - for content_item in message['content']: + if "content" in message and isinstance(message["content"], list): + for content_item in message["content"]: # Check for image type content - if isinstance(content_item, dict) and content_item.get('type') == 'image': + if ( + isinstance(content_item, dict) + and content_item.get("type") == "image" + ): # Replace actual image data with placeholder - if 'source' in content_item: - content_item['source'] = {'data': '[image_data]'} - elif isinstance(content_item, dict) and 'image' in content_item: + if "source" in content_item: + content_item["source"] = {"data": "[image_data]"} + elif isinstance(content_item, dict) and "image" in content_item: # Handle different image format used by some models - content_item['image'] = '[image_data]' - elif isinstance(content_item, dict) and 'bytes' in content_item: + content_item["image"] = "[image_data]" + elif isinstance(content_item, dict) and "bytes" in content_item: # Handle raw binary format - content_item['bytes'] = '[binary_data]' - elif isinstance(content_item, dict) and 'document' in content_item: + content_item["bytes"] = "[binary_data]" + elif isinstance(content_item, dict) and "document" in content_item: # Handle different image format used by some models - content_item['document'] = '[document_data]' - + content_item["document"] = "[document_data]" + return sanitized - - def _sanitize_response_for_logging(self, response: Dict[str, Any]) -> Dict[str, Any]: + + def _sanitize_response_for_logging( + self, response: Dict[str, Any] + ) -> Dict[str, Any]: """ Create a sanitized copy of the response suitable for logging. 
- + Args: response: Response from Bedrock API - + Returns: Sanitized response suitable for logging """ # Create a deep copy to avoid modifying the original sanitized = copy.deepcopy(response) - + # For very large responses, limit the content for logging - if 'output' in sanitized and 'message' in sanitized['output']: - message = sanitized['output']['message'] - if 'content' in message: - content = message['content'] - + if "output" in sanitized and "message" in sanitized["output"]: + message = sanitized["output"]["message"] + if "content" in message: + content = message["content"] + # Handle list of content items (multimodal responses) if isinstance(content, list): for i, item in enumerate(content): if isinstance(item, dict): # Truncate text content if too long - if 'text' in item and isinstance(item['text'], str) and len(item['text']) > 500: - item['text'] = item['text'][:500] + '... [truncated]' + if ( + "text" in item + and isinstance(item["text"], str) + and len(item["text"]) > 500 + ): + item["text"] = item["text"][:500] + "... [truncated]" # Replace image data with placeholder - if 'image' in item: - item['image'] = '[image_data]' + if "image" in item: + item["image"] = "[image_data]" # Handle string content elif isinstance(content, str) and len(content) > 500: - message['content'] = content[:500] + '... [truncated]' - + message["content"] = content[:500] + "... 
[truncated]" + return sanitized diff --git a/lib/idp_common_pkg/idp_common/classification/service.py b/lib/idp_common_pkg/idp_common/classification/service.py index 02f717143..4579eed49 100644 --- a/lib/idp_common_pkg/idp_common/classification/service.py +++ b/lib/idp_common_pkg/idp_common/classification/service.py @@ -40,7 +40,9 @@ from idp_common.config.models import IDPConfig from idp_common.config.schema_constants import ( X_AWS_IDP_CLASSIFICATION, + X_AWS_IDP_DOCUMENT_NAME_REGEX, X_AWS_IDP_DOCUMENT_TYPE, + X_AWS_IDP_PAGE_CONTENT_REGEX, ) from idp_common.models import Document, Section, Status from idp_common.utils import extract_json_from_text, extract_structured_data_from_text @@ -63,11 +65,11 @@ class ClassificationService: def __init__( self, - region: str = None, + region: str | None = None, max_workers: int = 20, - config: Union[Dict[str, Any], IDPConfig] = None, + config: dict[str, Any] | IDPConfig | None = None, backend: str = "bedrock", - cache_table: str = None, + cache_table: str | None = None, ): """ Initialize the classification service. 
@@ -105,7 +107,7 @@ def __init__( self.cache_table = None if self.cache_table_name: dynamodb = boto3.resource("dynamodb", region_name=self.region) - self.cache_table = dynamodb.Table(self.cache_table_name) + self.cache_table = dynamodb.Table(self.cache_table_name) # pyright: ignore[reportAttributeAccessIssue] logger.info( f"Classification caching enabled using table: {self.cache_table_name}" ) @@ -168,14 +170,21 @@ def _load_document_types(self) -> List[DocumentType]: classes = self.config.classes for schema in classes: classification_meta = schema.get(X_AWS_IDP_CLASSIFICATION, {}) + + # Support both new top-level format and legacy nested format for regex patterns + document_name_regex = schema.get( + X_AWS_IDP_DOCUMENT_NAME_REGEX + ) or classification_meta.get("documentNamePattern") + document_page_content_regex = schema.get( + X_AWS_IDP_PAGE_CONTENT_REGEX + ) or classification_meta.get("pageContentPattern") + doc_types.append( DocumentType( type_name=schema.get(X_AWS_IDP_DOCUMENT_TYPE, ""), description=schema.get("description", ""), - document_name_regex=classification_meta.get("documentNamePattern"), - document_page_content_regex=classification_meta.get( - "pageContentPattern" - ), + document_name_regex=document_name_regex, + document_page_content_regex=document_page_content_regex, ) ) @@ -242,8 +251,9 @@ def _limit_pages_for_classification(self, document: Document) -> Document: # Create limited document limited_pages = {pid: document.pages[pid] for pid in limited_page_ids} + document_id = document.id if document.id else "" limited_document = Document( - id=document.id + f"_limited_{max_pages}", + id=document_id + f"_limited_{max_pages}", pages=limited_pages, status=document.status, workflow_execution_arn=document.workflow_execution_arn, @@ -301,8 +311,17 @@ def _apply_limited_classification_to_all_pages( doc_type=primary_classification, pages=list(original_document.pages.keys()), ) - original_document.sections = [section] - + if isinstance(section, Section): + 
original_document.sections = [section] + else: + # Handle DocumentSection - convert to Section + original_document.sections = [ + Section( + section_id=section.section_id, + classification=section.classification.doc_type, + page_ids=[page.page_id for page in section.pages], + ) + ] # Transfer metering data from classified document to original document if classified_document.metering: original_document.metering = utils.merge_metering_data( @@ -357,18 +376,11 @@ def _classify_pages_multimodal(self, document: Document) -> Document: page_id ].confidence = cached_result.classification.confidence - # Copy metadata (including boundary information) to the page - if hasattr(document.pages[page_id], "metadata"): - document.pages[ - page_id - ].metadata = cached_result.classification.metadata - else: - # If the page doesn't have a metadata attribute, add it - setattr( - document.pages[page_id], - "metadata", - cached_result.classification.metadata, - ) + setattr( + document.pages[page_id], + "metadata", + cached_result.classification.metadata, + ) # Merge cached metering data page_metering = cached_result.classification.metadata.get( @@ -419,17 +431,11 @@ def _classify_pages_multimodal(self, document: Document) -> Document: ].confidence = page_result.classification.confidence # Copy metadata (including boundary information) to the page - if hasattr(document.pages[page_id], "metadata"): - document.pages[ - page_id - ].metadata = page_result.classification.metadata - else: - # If the page doesn't have a metadata attribute, add it - setattr( - document.pages[page_id], - "metadata", - page_result.classification.metadata, - ) + setattr( + document.pages[page_id], + "metadata", + page_result.classification.metadata, + ) # Merge metering data page_metering = page_result.classification.metadata.get( @@ -528,7 +534,17 @@ def _classify_pages_multimodal(self, document: Document) -> Document: doc_type=current_type, pages=[p.page_id for p in current_pages], ) - 
document.sections.append(section) + + if isinstance(section, Section): + document.sections.append(section) + else: + document.sections.append( + Section( + section_id=section.section_id, + classification=section.classification.doc_type, + page_ids=[page.page_id for page in section.pages], + ) + ) # Start a new group current_group += 1 @@ -541,7 +557,17 @@ def _classify_pages_multimodal(self, document: Document) -> Document: doc_type=current_type, pages=[p.page_id for p in current_pages], ) - document.sections.append(section) + + if isinstance(section, Section): + document.sections.append(section) + else: + document.sections.append( + Section( + section_id=section.section_id, + classification=section.classification.doc_type, + page_ids=[page.page_id for page in section.pages], + ) + ) # Update document status and metering document = self._update_document_status(document) @@ -640,8 +666,8 @@ def _get_classification_config(self) -> Dict[str, Any]: def _prepare_prompt_from_template( self, prompt_template: str, - substitutions: Dict[str, str], - required_placeholders: List[str] = None, + substitutions: dict[str, str], + required_placeholders: list[str] | None = None, ) -> str: """ Prepare prompt from template by replacing placeholders with values. 
@@ -1328,7 +1354,11 @@ def _get_cache_key(self, document: Document) -> str: Returns: Cache key string """ - workflow_id = document.workflow_execution_arn.split(":")[-1] + workflow_id = ( + document.workflow_execution_arn.split(":")[-1] + if document.workflow_execution_arn + else "unknown" + ) return f"classcache#{document.id}#{workflow_id}" def _get_cached_page_classifications( @@ -1517,7 +1547,17 @@ def classify_document(self, document: Document) -> Document: pages=page_ids, confidence=1.0, ) - document.sections = [section] + + if isinstance(section, Section): + document.sections = [section] + else: + document.sections = [ + Section( + section_id=section.section_id, + classification=section.classification.doc_type, + page_ids=[page.page_id for page in section.pages], + ) + ] # Update document status document = self._update_document_status(document) @@ -1540,11 +1580,23 @@ def classify_document(self, document: Document) -> Document: page_ids = list(document.pages.keys()) section = self._create_section( section_id="1", - doc_type=self.single_class_name, + doc_type=self.single_class_name + if self.single_class_name + else "undefined", pages=page_ids, confidence=1.0, ) - document.sections = [section] + + if isinstance(section, Section): + document.sections = [section] + else: + document.sections = [ + Section( + section_id=section.section_id, + classification=section.classification.doc_type, + page_ids=[page.page_id for page in section.pages], + ) + ] # Update document status document = self._update_document_status(document) @@ -1654,7 +1706,7 @@ def _sort_page_results( return sorted(results, key=lambda x: x.page_id) def _create_section( - self, section_id: str, doc_type: str, pages: List, confidence: float = 1.0 + self, section_id: str, doc_type: str, pages: List[Any], confidence: float = 1.0 ) -> Union[DocumentSection, Section]: """ Create a document section based on the input type. 
@@ -1869,7 +1921,9 @@ def holistic_classify_document(self, document: Document) -> Document: page_ids = list(document.pages.keys()) section = Section( section_id="1", - classification=self.single_class_name, + classification=self.single_class_name + if self.single_class_name + else "undefined", confidence=1.0, page_ids=page_ids, ) diff --git a/lib/idp_common_pkg/idp_common/config/migration.py b/lib/idp_common_pkg/idp_common/config/migration.py index 6fe8d81fc..53a3f2772 100644 --- a/lib/idp_common_pkg/idp_common/config/migration.py +++ b/lib/idp_common_pkg/idp_common/config/migration.py @@ -24,6 +24,8 @@ X_AWS_IDP_CLASS_PROMPT, X_AWS_IDP_ATTRIBUTES_PROMPT, X_AWS_IDP_IMAGE_PATH, + X_AWS_IDP_DOCUMENT_NAME_REGEX, + X_AWS_IDP_PAGE_CONTENT_REGEX, VALID_EVALUATION_METHODS, MAX_PROMPT_OVERRIDE_LENGTH, # Attribute types (for legacy migration only) @@ -46,6 +48,8 @@ LEGACY_CLASS_PROMPT, LEGACY_ATTRIBUTES_PROMPT, LEGACY_IMAGE_PATH, + LEGACY_DOCUMENT_NAME_REGEX, + LEGACY_DOCUMENT_PAGE_CONTENT_REGEX, ) @@ -153,6 +157,17 @@ def migrate_legacy_to_schema( if LEGACY_EXAMPLES in class_config: migrated_class[X_AWS_IDP_EXAMPLES] = class_config[LEGACY_EXAMPLES] + # Migrate regex patterns if present + if LEGACY_DOCUMENT_NAME_REGEX in class_config: + migrated_class[X_AWS_IDP_DOCUMENT_NAME_REGEX] = class_config[ + LEGACY_DOCUMENT_NAME_REGEX + ] + + if LEGACY_DOCUMENT_PAGE_CONTENT_REGEX in class_config: + migrated_class[X_AWS_IDP_PAGE_CONTENT_REGEX] = class_config[ + LEGACY_DOCUMENT_PAGE_CONTENT_REGEX + ] + legacy_attributes = class_config.get(LEGACY_ATTRIBUTES, []) for attr in legacy_attributes: @@ -456,6 +471,17 @@ def _convert_classes_to_json_schema( ): schema[X_AWS_IDP_EXAMPLES] = doc_type_class[X_AWS_IDP_EXAMPLES] + # Add regex patterns if present + if X_AWS_IDP_DOCUMENT_NAME_REGEX in doc_type_class: + schema[X_AWS_IDP_DOCUMENT_NAME_REGEX] = doc_type_class[ + X_AWS_IDP_DOCUMENT_NAME_REGEX + ] + + if X_AWS_IDP_PAGE_CONTENT_REGEX in doc_type_class: + schema[X_AWS_IDP_PAGE_CONTENT_REGEX] 
= doc_type_class[ + X_AWS_IDP_PAGE_CONTENT_REGEX + ] + if defs: schema[DEFS_FIELD] = defs diff --git a/lib/idp_common_pkg/idp_common/config/schema_constants.py b/lib/idp_common_pkg/idp_common/config/schema_constants.py index 85cf02306..8fdf07bf6 100644 --- a/lib/idp_common_pkg/idp_common/config/schema_constants.py +++ b/lib/idp_common_pkg/idp_common/config/schema_constants.py @@ -25,6 +25,10 @@ # Classification metadata for document type X_AWS_IDP_CLASSIFICATION = "x-aws-idp-classification" +# Regex patterns for classification optimization +X_AWS_IDP_DOCUMENT_NAME_REGEX = "x-aws-idp-document-name-regex" +X_AWS_IDP_PAGE_CONTENT_REGEX = "x-aws-idp-document-page-content-regex" + # ============================================================================ # Legacy Attribute Type Values (for migration only) # ============================================================================ @@ -52,8 +56,7 @@ X_AWS_IDP_EVALUATION_METHOD = "x-aws-idp-evaluation-method" - -X_AWS_IDP_EXAMPLES= "x-aws-idp-examples" +X_AWS_IDP_EXAMPLES = "x-aws-idp-examples" # Valid evaluation methods EVALUATION_METHOD_EXACT = "EXACT" @@ -114,6 +117,10 @@ LEGACY_ATTRIBUTES_PROMPT = "attributesPrompt" LEGACY_IMAGE_PATH = "imagePath" +# Legacy regex fields (same name in both legacy and new format) +LEGACY_DOCUMENT_NAME_REGEX = "document_name_regex" +LEGACY_DOCUMENT_PAGE_CONTENT_REGEX = "document_page_content_regex" + # ============================================================================ # JSON Schema Standard Property Names # ============================================================================ diff --git a/lib/idp_common_pkg/tests/unit/test_max_pages_classification.py b/lib/idp_common_pkg/tests/unit/test_max_pages_classification.py index 843324ea1..bb720989b 100644 --- a/lib/idp_common_pkg/tests/unit/test_max_pages_classification.py +++ b/lib/idp_common_pkg/tests/unit/test_max_pages_classification.py @@ -1,11 +1,10 @@ # Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. # SPDX-License-Identifier: MIT-0 -from unittest.mock import Mock, patch import pytest from idp_common.classification.service import ClassificationService -from idp_common.models import Document, Page +from idp_common.models import Document, Page, Section @pytest.mark.unit @@ -99,18 +98,13 @@ def test_apply_limited_classification_single_type(self, classification_service): classified_doc.pages["1"].classification = "invoice" classified_doc.pages["2"].classification = "invoice" - # Mock sections - mock_section = Mock() - mock_section.classification = "invoice" - mock_section.page_ids = ["1", "2"] - classified_doc.sections = [mock_section] + # Create real Section objects + section = Section(section_id="1", classification="invoice", page_ids=["1", "2"]) + classified_doc.sections = [section] - with patch.object(classification_service, "_create_section") as mock_create: - mock_create.return_value = mock_section - - result = classification_service._apply_limited_classification_to_all_pages( - original_doc, classified_doc - ) + result = classification_service._apply_limited_classification_to_all_pages( + original_doc, classified_doc + ) # All pages should be classified as "invoice" assert result.pages["1"].classification == "invoice" @@ -138,23 +132,16 @@ def test_apply_limited_classification_tie_breaker(self, classification_service): classified_doc.pages["1"].classification = "payslip" classified_doc.pages["2"].classification = "drivers_license" - # Mock sections - payslip processed first - mock_section1 = Mock() - mock_section1.classification = "payslip" - mock_section1.page_ids = ["1"] - - mock_section2 = Mock() - mock_section2.classification = "drivers_license" - mock_section2.page_ids = ["2"] - - classified_doc.sections = [mock_section1, mock_section2] - - with patch.object(classification_service, "_create_section") as mock_create: - mock_create.return_value = mock_section1 + # Create real Section objects - payslip processed first + section1 = 
Section(section_id="1", classification="payslip", page_ids=["1"]) + section2 = Section( + section_id="2", classification="drivers_license", page_ids=["2"] + ) + classified_doc.sections = [section1, section2] - result = classification_service._apply_limited_classification_to_all_pages( - original_doc, classified_doc - ) + result = classification_service._apply_limited_classification_to_all_pages( + original_doc, classified_doc + ) # Should pick "payslip" due to insertion order tie-breaker assert result.pages["1"].classification == "payslip" diff --git a/notebooks/examples/config/classes.yaml b/notebooks/examples/config/classes.yaml index 83f9cfaba..a2d1590a3 100644 --- a/notebooks/examples/config/classes.yaml +++ b/notebooks/examples/config/classes.yaml @@ -1,900 +1,1173 @@ # Document Classes and Attributes Configuration classes: - - name: Payslip - description: >- - An employee wage statement showing earnings, deductions, taxes, and net pay for a specific pay period, - typically issued by employers to document compensation details including gross pay, various tax withholdings, - and year-to-date totals. - attributes: - - name: YTDNetPay - description: >- - Year-to-date net pay amount representing cumulative take-home earnings after all deductions - from the beginning of the year to the current pay period. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: PayPeriodStartDate - description: >- - The beginning date of the pay period covered by this payslip, indicating when the earning - period started for the compensation shown. - evaluation_method: EXACT - attributeType: simple - - name: PayPeriodEndDate - description: >- - The ending date of the pay period covered by this payslip, indicating when the earning - period ended for the compensation shown. - evaluation_method: EXACT - attributeType: simple - - name: PayDate - description: >- - The actual date when the employee was paid, representing when the compensation was issued - or deposited. 
- evaluation_method: EXACT - attributeType: simple - - name: CurrentGrossPay - description: >- - The total earnings before any deductions for the current pay period, representing gross - compensation for the period. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: YTDGrossPay - description: >- - Year-to-date gross pay representing cumulative earnings before deductions from the - beginning of the year to the current pay period. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: CurrentNetPay - description: >- - The take-home pay after all deductions for the current pay period, representing the - actual amount paid to the employee. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: CurrentTotalDeductions - description: >- - Total amount deducted from gross pay for the current period, including all taxes, - benefits, and other withholdings. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: YTDTotalDeductions - description: >- - Year-to-date total deductions representing cumulative amounts withheld from gross pay - from the beginning of the year to the current pay period. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: RegularHourlyRate - description: >- - The standard hourly wage rate for regular working hours, representing the base - compensation rate for normal work time. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: HolidayHourlyRate - description: >- - The hourly wage rate for holiday work, typically higher than the regular rate to - reflect premium compensation for holiday hours. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: EmployeeNumber - description: >- - The unique identifier assigned to the employee by the employer for payroll and - administrative purposes. 
- evaluation_method: EXACT - attributeType: simple - - name: PayrollNumber - description: >- - The payroll batch or sequence number for this pay period, used for payroll processing - identification and tracking. - evaluation_method: EXACT - attributeType: simple - - name: FederalFilingStatus - description: >- - The employee's federal tax filing status for withholding purposes, such as Single, - Married Filing Jointly, etc. - evaluation_method: EXACT - attributeType: simple - - name: StateFilingStatus - description: >- - The employee's state tax filing status for withholding purposes, which may differ - from federal filing status based on state requirements. - evaluation_method: EXACT - attributeType: simple - - name: YTDFederalTax - description: >- - Year-to-date federal income tax withheld, representing cumulative federal tax - deductions from the beginning of the year. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: YTDStateTax - description: >- - Year-to-date state income tax withheld, representing cumulative state tax deductions - from the beginning of the year. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: YTDCityTax - description: >- - Year-to-date city or local income tax withheld, representing cumulative local tax - deductions from the beginning of the year. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: currency - description: >- - The currency in which all monetary amounts on the payslip are denominated, typically - represented as a three-letter code like USD, EUR, etc. - evaluation_method: EXACT - attributeType: simple - - name: is_gross_pay_valid - description: >- - A validation flag indicating whether the gross pay calculation is correct and valid - based on payroll system checks. 
- evaluation_method: EXACT - attributeType: simple - - name: are_field_names_sufficient - description: >- - A validation flag indicating whether the field names on the payslip provide sufficient - information for processing and understanding. - evaluation_method: EXACT - attributeType: simple - - name: is_ytd_gross_pay_highest - description: >- - A validation flag indicating whether the year-to-date gross pay represents the highest - value among pay categories. - evaluation_method: EXACT - attributeType: simple - - name: CompanyAddress - groupAttributes: - - name: State + - $schema: https://json-schema.org/draft/2020-12/schema + $defs: + Address: + type: object + description: >- + The complete business address of the employing company, including + street address, city, state, and postal code information. + properties: + City: + type: string + description: The city portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + Line1: + type: string + description: >- + The primary street address line of the company's business + location. + x-aws-idp-evaluation-method: EXACT + State: + type: string description: The state or province portion of the company's business address. - evaluation_method: EXACT - - name: ZipCode + x-aws-idp-evaluation-method: EXACT + ZipCode: + type: string description: The postal code portion of the company's business address. - evaluation_method: EXACT - - name: City - description: The city portion of the company's business address. - evaluation_method: EXACT - - name: Line1 - description: The primary street address line of the company's business location. - evaluation_method: EXACT - - name: Line2 - description: The secondary address line for the company, such as suite or floor number. - evaluation_method: EXACT - description: >- - The complete business address of the employing company, including street address, - city, state, and postal code information. 
- evaluation_method: LLM - attributeType: group - - name: EmployeeAddress - groupAttributes: - - name: State - description: The state or province portion of the employee's residential address. - evaluation_method: EXACT - - name: ZipCode - description: The postal code portion of the employee's residential address. - evaluation_method: EXACT - - name: City - description: The city portion of the employee's residential address. - evaluation_method: EXACT - - name: Line1 - description: The primary street address line of the employee's residence. - evaluation_method: EXACT - - name: Line2 - description: The secondary address line for the employee, such as apartment number. - evaluation_method: EXACT - description: >- - The complete residential address of the employee, including street address, city, - state, and postal code information. - evaluation_method: LLM - attributeType: group - - name: EmployeeName - groupAttributes: - - name: FirstName - description: The given name of the employee. - evaluation_method: EXACT - - name: SuffixName + x-aws-idp-evaluation-method: EXACT + Line2: + type: string + description: >- + The secondary address line for the company, such as suite or floor + number. + x-aws-idp-evaluation-method: EXACT + EmployeeName: + type: object + description: >- + The complete name information of the employee, including first name, + middle name, last name, and any suffix. + properties: + SuffixName: + type: string description: Name suffix such as Jr., Sr., III, etc. - evaluation_method: EXACT - - name: LastName + x-aws-idp-evaluation-method: EXACT + LastName: + type: string description: The family name or surname of the employee. - evaluation_method: EXACT - - name: MiddleName + x-aws-idp-evaluation-method: EXACT + MiddleName: + type: string description: The middle name or initial of the employee. - evaluation_method: EXACT - description: >- - The complete name information of the employee, including first name, middle name, - last name, and any suffix. 
- evaluation_method: LLM - attributeType: group - - name: FederalTaxes - listItemTemplate: - itemAttributes: - - name: YTD - description: Year-to-date amount for this federal tax item. - evaluation_method: NUMERIC_EXACT - - name: Period - description: Current period amount for this federal tax item. - evaluation_method: NUMERIC_EXACT - - name: ItemDescription - description: Description of the specific federal tax type or category. - evaluation_method: EXACT - itemDescription: Each item represents a specific federal tax withholding category - description: >- - List of federal tax withholdings showing different types of federal taxes deducted, - with both current period and year-to-date amounts. - evaluation_method: LLM - attributeType: list - - name: CityTaxes - listItemTemplate: - itemAttributes: - - name: YTD - description: Year-to-date amount for this city tax item. - evaluation_method: NUMERIC_EXACT - - name: Period - description: Current period amount for this city tax item. - evaluation_method: NUMERIC_EXACT - - name: ItemDescription - description: Description of the specific city tax type or jurisdiction. - evaluation_method: EXACT - itemDescription: Each item represents a specific city or local tax withholding - description: >- - List of city or local tax withholdings showing different municipal taxes deducted, - with both current period and year-to-date amounts. - evaluation_method: LLM - attributeType: list - - name: StateTaxes - listItemTemplate: - itemAttributes: - - name: YTD - description: Year-to-date amount for this state tax item. - evaluation_method: NUMERIC_EXACT - - name: Period - description: Current period amount for this state tax item. - evaluation_method: NUMERIC_EXACT - - name: ItemDescription - description: Description of the specific state tax type or category. 
- evaluation_method: EXACT - itemDescription: Each item represents a specific state tax withholding category - description: >- - List of state tax withholdings showing different types of state taxes deducted, - with both current period and year-to-date amounts. - evaluation_method: LLM - attributeType: list - - - name: US-drivers-licenses + x-aws-idp-evaluation-method: EXACT + FirstName: + type: string + description: The given name of the employee. + x-aws-idp-evaluation-method: EXACT + TaxInfo: + type: object + properties: + YTD: + type: string + description: Year-to-date amount for this federal tax item. + x-aws-idp-evaluation-method: NUMERIC_EXACT + Period: + type: string + description: Current period amount for this federal tax item. + x-aws-idp-evaluation-method: NUMERIC_EXACT + ItemDescription: + type: string + description: Description of the specific federal tax type or category. + x-aws-idp-evaluation-method: EXACT description: >- - An official government-issued identification document that authorizes an individual to operate - motor vehicles, containing personal information, physical characteristics, address details, - and driving privileges with restrictions and endorsements. - attributes: - - name: STATE_NAME - description: >- - The state or jurisdiction that issued the driver's license, typically shown as a - two-letter state abbreviation like MA, CA, NY, etc. - evaluation_method: EXACT - attributeType: simple - - name: ID_NUMBER - description: >- - The unique driver's license identification number assigned by the issuing state, - prominently displayed on the license for identification purposes. - evaluation_method: EXACT - attributeType: simple - - name: EXPIRATION_DATE - description: >- - The date when the driver's license expires and requires renewal, typically in - YYYY-MM-DD format indicating when the license becomes invalid. 
- evaluation_method: EXACT - attributeType: simple - - name: DATE_OF_ISSUE - description: >- - The date when the driver's license was originally issued by the state authority, - typically in YYYY-MM-DD format showing the license creation date. - evaluation_method: EXACT - attributeType: simple - - name: CLASS - description: >- - The type or category of driving privileges granted by the license, such as Class D - for regular driver's license or other classifications for commercial vehicles. - evaluation_method: EXACT - attributeType: simple - - name: DATE_OF_BIRTH - description: >- - The birth date of the license holder in YYYY-MM-DD format, used for age verification - and identification purposes. - evaluation_method: EXACT - attributeType: simple - - name: COUNTY - description: >- - The county of residence for the license holder, though this field may be empty - if not provided by the issuing jurisdiction. - evaluation_method: EXACT - attributeType: simple - - name: NAME_DETAILS - groupAttributes: - - name: SUFFIX + An employee wage statement showing earnings, deductions, taxes, and net + pay for a specific pay period, typically issued by employers to document + compensation details including gross pay, various tax withholdings, and + year-to-date totals. + type: object + x-aws-idp-document-type: Payslip + properties: + YTDNetPay: + type: number + description: >- + Year-to-date net pay amount representing cumulative take-home earnings + after all deductions from the beginning of the year to the current + pay period. + x-aws-idp-evaluation-method: NUMERIC_EXACT + PayPeriodStartDate: + format: date + description: >- + The beginning date of the pay period covered by this payslip, + indicating when the earning period started for the compensation + shown. 
+ type: string + x-aws-idp-evaluation-method: EXACT + FederalTaxes: + description: >- + List of federal tax withholdings showing different types of federal + taxes deducted, with both current period and year-to-date amounts. + type: array + x-aws-idp-list-item-description: Each item represents a specific federal tax withholding category + items: + $ref: "#/$defs/TaxInfo" + x-aws-idp-evaluation-method: LLM + CurrentGrossPay: + type: number + description: >- + The total earnings before any deductions for the current pay period, + representing gross compensation for the period. + x-aws-idp-evaluation-method: NUMERIC_EXACT + HolidayHourlyRate: + type: number + description: >- + The hourly wage rate for holiday work, typically higher than the + regular rate to reflect premium compensation for holiday hours. + x-aws-idp-evaluation-method: NUMERIC_EXACT + CompanyAddress: + description: >- + The complete business address of the employing company, including + street address, city, state, and postal code information. + $ref: "#/$defs/Address" + x-aws-idp-evaluation-method: LLM + CityTaxes: + description: >- + List of city or local tax withholdings showing different municipal + taxes deducted, with both current period and year-to-date amounts. + type: array + x-aws-idp-list-item-description: Each item represents a specific city or local tax withholding + items: + $ref: "#/$defs/TaxInfo" + x-aws-idp-evaluation-method: LLM + PayPeriodEndDate: + format: date + description: >- + The ending date of the pay period covered by this payslip, indicating + when the earning period ended for the compensation shown. + type: string + x-aws-idp-evaluation-method: EXACT + PayDate: + format: date + description: >- + The actual date when the employee was paid, representing when the + compensation was issued or deposited. 
+ type: string + x-aws-idp-evaluation-method: EXACT + currency: + type: string + description: >- + The currency in which all monetary amounts on the payslip are + denominated, typically represented as a three-letter code like USD, + EUR, etc. + x-aws-idp-evaluation-method: EXACT + YTDGrossPay: + type: number + description: >- + Year-to-date gross pay representing cumulative earnings before + deductions from the beginning of the year to the current pay period. + x-aws-idp-evaluation-method: NUMERIC_EXACT + EmployeeAddress: + description: >- + The complete residential address of the employee, including street + address, city, state, and postal code information. + $ref: "#/$defs/Address" + x-aws-idp-evaluation-method: LLM + is_gross_pay_valid: + type: boolean + description: >- + A validation flag indicating whether the gross pay calculation is + correct and valid based on payroll system checks. + x-aws-idp-evaluation-method: EXACT + StateFilingStatus: + type: string + description: >- + The employee's state tax filing status for withholding purposes, which + may differ from federal filing status based on state requirements. + x-aws-idp-evaluation-method: EXACT + YTDCityTax: + description: >- + Year-to-date city or local income tax withheld, representing + cumulative local tax deductions from the beginning of the year. + $ref: "#/$defs/TaxInfo" + x-aws-idp-evaluation-method: NUMERIC_EXACT + EmployeeNumber: + type: string + description: >- + The unique identifier assigned to the employee by the employer for + payroll and administrative purposes. + x-aws-idp-evaluation-method: EXACT + RegularHourlyRate: + type: number + description: >- + The standard hourly wage rate for regular working hours, representing + the base compensation rate for normal work time. 
+ x-aws-idp-evaluation-method: NUMERIC_EXACT + are_field_names_sufficient: + type: boolean + description: >- + A validation flag indicating whether the field names on the payslip + provide sufficient information for processing and understanding. + x-aws-idp-evaluation-method: EXACT + YTDTotalDeductions: + type: number + description: >- + Year-to-date total deductions representing cumulative amounts withheld + from gross pay from the beginning of the year to the current pay + period. + x-aws-idp-evaluation-method: NUMERIC_EXACT + is_ytd_gross_pay_highest: + type: boolean + description: >- + A validation flag indicating whether the year-to-date gross pay + represents the highest value among pay categories. + x-aws-idp-evaluation-method: EXACT + StateTaxes: + description: >- + List of state tax withholdings showing different types of state taxes + deducted, with both current period and year-to-date amounts. + type: array + x-aws-idp-list-item-description: Each item represents a specific state tax withholding category + items: + $ref: "#/$defs/TaxInfo" + x-aws-idp-evaluation-method: LLM + PayrollNumber: + type: string + description: >- + The payroll batch or sequence number for this pay period, used for + payroll processing identification and tracking. + x-aws-idp-evaluation-method: EXACT + YTDStateTax: + description: >- + Year-to-date state income tax withheld, representing cumulative state + tax deductions from the beginning of the year. + $ref: "#/$defs/TaxInfo" + x-aws-idp-evaluation-method: NUMERIC_EXACT + CurrentTotalDeductions: + type: number + description: >- + Total amount deducted from gross pay for the current period, including + all taxes, benefits, and other withholdings. + x-aws-idp-evaluation-method: NUMERIC_EXACT + FederalFilingStatus: + type: string + description: >- + The employee's federal tax filing status for withholding purposes, + such as Single, Married Filing Jointly, etc. 
+ x-aws-idp-evaluation-method: EXACT + EmployeeName: + description: >- + The complete name information of the employee, including first name, + middle name, last name, and any suffix. + $ref: "#/$defs/EmployeeName" + x-aws-idp-evaluation-method: LLM + CurrentNetPay: + type: number + description: >- + The take-home pay after all deductions for the current pay period, + representing the actual amount paid to the employee. + x-aws-idp-evaluation-method: NUMERIC_EXACT + YTDFederalTax: + description: >- + Year-to-date federal income tax withheld, representing cumulative + federal tax deductions from the beginning of the year. + $ref: "#/$defs/TaxInfo" + x-aws-idp-evaluation-method: NUMERIC_EXACT + required: + - PayDate + - CurrentGrossPay + - YTDGrossPay + - CurrentNetPay + $id: Payslip + - $schema: https://json-schema.org/draft/2020-12/schema + $defs: + Address: + type: object + description: >- + The complete business address of the employing company, including + street address, city, state, and postal code information. + properties: + City: + type: string + description: The city portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + Line1: + type: string + description: >- + The primary street address line of the company's business + location. + x-aws-idp-evaluation-method: EXACT + State: + type: string + description: The state or province portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + ZipCode: + type: string + description: The postal code portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + Line2: + type: string + description: >- + The secondary address line for the company, such as suite or floor + number. + x-aws-idp-evaluation-method: EXACT + EmployeeName: + type: object + description: >- + The complete name information of the employee, including first name, + middle name, last name, and any suffix. 
+ properties: + SuffixName: + type: string description: Name suffix such as Jr., Sr., III, etc. - evaluation_method: EXACT - - name: MIDDLE_NAME - description: The middle name of the license holder. - evaluation_method: EXACT - - name: LAST_NAME - description: The family name or surname of the license holder. - evaluation_method: EXACT - - name: FIRST_NAME - description: The given name of the license holder. - evaluation_method: EXACT - description: >- - Complete name information of the license holder including first name, middle name, - last name, and any suffix, structured for official identification. - evaluation_method: LLM - attributeType: group - - name: PERSONAL_DETAILS - groupAttributes: - - name: SEX - description: The gender of the license holder, typically 'M' for male or 'F' for female. - evaluation_method: EXACT - - name: HAIR_COLOR - description: The color of the license holder's hair, often abbreviated like BLN, BRN, etc. - evaluation_method: EXACT - - name: HEIGHT - description: The physical height of the license holder, often in feet-inches format like '5-10'. - evaluation_method: EXACT - - name: WEIGHT + x-aws-idp-evaluation-method: EXACT + LastName: + type: string + description: The family name or surname of the employee. + x-aws-idp-evaluation-method: EXACT + MiddleName: + type: string + description: The middle name or initial of the employee. + x-aws-idp-evaluation-method: EXACT + FirstName: + type: string + description: The given name of the employee. + x-aws-idp-evaluation-method: EXACT + PERSONAL_DETAILS: + type: object + description: >- + Physical characteristics and personal details of the license holder + used for identification purposes, including gender, height, weight, + and eye/hair color. + properties: + HAIR_COLOR: + type: string + description: >- + The color of the license holder's hair, often abbreviated like + BLN, BRN, etc. 
+ x-aws-idp-evaluation-method: EXACT + HEIGHT: + type: string + description: >- + The physical height of the license holder, often in feet-inches + format like '5-10'. + x-aws-idp-evaluation-method: EXACT + WEIGHT: + type: string description: The weight of the license holder, typically in pounds. - evaluation_method: EXACT - - name: EYE_COLOR - description: The color of the license holder's eyes, often abbreviated like BLU, BRN, GRN, etc. - evaluation_method: EXACT - description: >- - Physical characteristics and personal details of the license holder used for - identification purposes, including gender, height, weight, and eye/hair color. - evaluation_method: LLM - attributeType: group - - name: ADDRESS_DETAILS - groupAttributes: - - name: CITY - description: The city of residence for the license holder. - evaluation_method: EXACT - - name: ZIP_CODE - description: The postal code of the license holder's address. - evaluation_method: EXACT - - name: STATE - description: The state of residence for the license holder, may be abbreviated. - evaluation_method: EXACT - - name: STREET_ADDRESS - description: The street address of the license holder's residence. - evaluation_method: EXACT - description: >- - Complete residential address information of the license holder including street - address, city, state, and postal code. - evaluation_method: LLM - attributeType: group - - name: ENDORSEMENTS - listItemTemplate: - itemAttributes: - - name: endorsement - description: Specific driving endorsement or certification code. - evaluation_method: EXACT - itemDescription: Each item represents a special driving endorsement or certification - description: >- - List of special driving endorsements or certifications held by the license holder, - or 'NONE' if no special endorsements apply. - evaluation_method: LLM - attributeType: list - - name: RESTRICTIONS - listItemTemplate: - itemAttributes: - - name: restriction - description: Specific driving restriction or limitation code. 
- evaluation_method: EXACT - itemDescription: Each item represents a driving restriction or limitation - description: >- - List of driving restrictions or limitations that apply to the license holder, - or 'NONE' if no restrictions apply. - evaluation_method: LLM - attributeType: list - - - name: Bank-checks + x-aws-idp-evaluation-method: EXACT + SEX: + type: string + description: >- + The gender of the license holder, typically 'M' for male or 'F' + for female. + x-aws-idp-evaluation-method: EXACT + EYE_COLOR: + type: string + description: >- + The color of the license holder's eyes, often abbreviated like + BLU, BRN, GRN, etc. + x-aws-idp-evaluation-method: EXACT description: >- - A written financial instrument directing a bank to pay a specific amount of money from - the account holder's account to a designated payee, containing payment details, account - information, and verification elements. - attributes: - - name: date - description: >- - The date when the check was written, typically handwritten or printed in the - date field of the check. - evaluation_method: EXACT - attributeType: simple - - name: dollar_amount - description: >- - The numerical amount to be paid as specified on the check, typically found in - the amount box on the right side of the check. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: check_number - description: >- - The unique sequential number identifying this specific check, usually found in - the upper right corner and bottom of the check. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: account_holder_name - description: >- - The name of the person or entity who owns the bank account and wrote the check, - typically printed in the upper left corner. - evaluation_method: EXACT - attributeType: simple - - name: payee_name - description: >- - The name of the person or entity receiving the payment, written on the 'Pay to - the order of' line of the check. 
- evaluation_method: EXACT - attributeType: simple - - name: bank_name - description: >- - The name of the financial institution where the account is held, usually printed - prominently on the check. - evaluation_method: EXACT - attributeType: simple - - name: memo - description: >- - Optional note or reference information written in the memo field, typically in - the lower left area of the check. - evaluation_method: EXACT - attributeType: simple - - name: routing_number_valid - description: >- - A boolean indicator of whether the bank routing number on the check is valid - and properly formatted. - evaluation_method: EXACT - attributeType: simple - - name: bank_routing_number - description: >- - The bank's routing number for electronic transactions, typically found in the - MICR line at the bottom of the check. - evaluation_method: EXACT - attributeType: simple - - name: amount_in_words - description: >- - The payment amount written out in words, typically on the line below the payee - name and ending with 'DOLLARS'. - evaluation_method: EXACT - attributeType: simple - - name: is_signed - description: >- - A boolean indicator of whether the check has been signed by the account holder - in the signature area. - evaluation_method: EXACT - attributeType: simple - - - name: Bank-Statement - description: >- - A periodic financial document issued by banks detailing account activity, balances, - and transactions over a specific time period, providing account holders with a summary - of their financial activity and current account status. - attributes: - - name: account_holder_address - description: >- - The mailing address of the account holder as recorded by the bank, typically - displayed prominently on the statement header. - evaluation_method: EXACT - attributeType: simple - - name: account_number - description: >- - The unique identifier for the bank account, often partially masked for security - purposes on the statement. 
- evaluation_method: EXACT - attributeType: simple - - name: account_type - description: >- - The category of bank account such as checking, savings, money market, etc., - indicating the type of banking service. - evaluation_method: EXACT - attributeType: simple - - name: statement_end_date - description: >- - The ending date of the statement period in MM/DD/YYYY format, indicating when - the reporting period concluded. - evaluation_method: EXACT - attributeType: simple - - name: statement_start_date - description: >- - The beginning date of the statement period in MM/DD/YYYY format, indicating - when the reporting period began. - evaluation_method: EXACT - attributeType: simple - - name: account_holder_name - description: >- - The name of the person or entity who owns the bank account, as registered - with the financial institution. - evaluation_method: EXACT - attributeType: simple - - name: branch_transit_number - description: >- - The specific branch identifier or transit number associated with the account, - used for routing and identification purposes. - evaluation_method: EXACT - attributeType: simple - - name: bank_name - description: >- - The name of the financial institution issuing the statement, typically displayed - prominently at the top of the document. - evaluation_method: EXACT - attributeType: simple - - name: account_summary - listItemTemplate: - itemAttributes: - - name: summary_desc - description: Description of the account summary item, such as opening balance or closing balance. - evaluation_method: EXACT - - name: summary_amount - description: The monetary amount associated with this summary item. - evaluation_method: NUMERIC_EXACT - itemDescription: Each item represents a key account balance or summary figure - description: >- - Summary of key account information including opening balance, closing balance, - and other important account totals for the statement period. 
- evaluation_method: LLM - attributeType: list - - name: transaction_details - listItemTemplate: - itemAttributes: - - name: date - description: The date when the transaction occurred. - evaluation_method: EXACT - - name: balance - description: The account balance after this transaction. - evaluation_method: NUMERIC_EXACT - - name: description - description: Description of the transaction or merchant information. - evaluation_method: EXACT - - name: deposits - description: Amount deposited or credited to the account. - evaluation_method: NUMERIC_EXACT - - name: withdrawals - description: Amount withdrawn or debited from the account. - evaluation_method: NUMERIC_EXACT - itemDescription: Each item represents an individual transaction record - description: >- - Detailed listing of all transactions that occurred during the statement period, - including deposits, withdrawals, and resulting account balances. - evaluation_method: LLM - attributeType: list - - - name: W2 + An official government-issued identification document that authorizes an + individual to operate motor vehicles, containing personal information, + physical characteristics, address details, and driving privileges with + restrictions and endorsements. + type: object + x-aws-idp-document-type: US-drivers-licenses + properties: + STATE_NAME: + type: string + description: >- + The state or jurisdiction that issued the driver's license, typically + shown as a two-letter state abbreviation like MA, CA, NY, etc. + x-aws-idp-evaluation-method: EXACT + NAME_DETAILS: + description: >- + Complete name information of the license holder including first name, + middle name, last name, and any suffix, structured for official + identification. + $ref: "#/$defs/EmployeeName" + x-aws-idp-evaluation-method: LLM + ID_NUMBER: + type: string + description: >- + The unique driver's license identification number assigned by the + issuing state, prominently displayed on the license for + identification purposes. 
+ x-aws-idp-evaluation-method: EXACT + EXPIRATION_DATE: + type: string + description: >- + The date when the driver's license expires and requires renewal, + typically in YYYY-MM-DD format indicating when the license becomes + invalid. + x-aws-idp-evaluation-method: EXACT + ENDORSEMENTS: + description: >- + List of special driving endorsements or certifications held by the + license holder, or 'NONE' if no special endorsements apply. + type: array + x-aws-idp-list-item-description: Each item represents a special driving endorsement or certification + items: + description: Specific driving endorsement or certification code. + type: string + x-aws-idp-original-name: endorsement + x-aws-idp-evaluation-method: EXACT + x-aws-idp-evaluation-method: LLM + PERSONAL_DETAILS: + description: >- + Physical characteristics and personal details of the license holder + used for identification purposes, including gender, height, weight, + and eye/hair color. + $ref: "#/$defs/PERSONAL_DETAILS" + x-aws-idp-evaluation-method: LLM + RESTRICTIONS: + description: >- + List of driving restrictions or limitations that apply to the license + holder, or 'NONE' if no restrictions apply. + type: array + x-aws-idp-list-item-description: Each item represents a driving restriction or limitation + items: + description: Specific driving restriction or limitation code. + type: string + x-aws-idp-original-name: restriction + x-aws-idp-evaluation-method: EXACT + x-aws-idp-evaluation-method: LLM + CLASS: + type: string + description: >- + The type or category of driving privileges granted by the license, + such as Class D for regular driver's license or other classifications + for commercial vehicles. + x-aws-idp-evaluation-method: EXACT + ADDRESS_DETAILS: + description: >- + Complete residential address information of the license holder + including street address, city, state, and postal code. 
+ $ref: "#/$defs/Address" + x-aws-idp-evaluation-method: LLM + DATE_OF_BIRTH: + type: string + description: >- + The birth date of the license holder in YYYY-MM-DD format, used for + age verification and identification purposes. + x-aws-idp-evaluation-method: EXACT + DATE_OF_ISSUE: + type: string + description: >- + The date when the driver's license was originally issued by the state + authority, typically in YYYY-MM-DD format showing the license + creation date. + x-aws-idp-evaluation-method: EXACT + COUNTY: + type: string + description: >- + The county of residence for the license holder, though this field may + be empty if not provided by the issuing jurisdiction. + x-aws-idp-evaluation-method: EXACT + $id: US-drivers-licenses + - description: >- + A written financial instrument directing a bank to pay a specific amount + of money from the account holder's account to a designated payee, + containing payment details, account information, and verification + elements. + $schema: https://json-schema.org/draft/2020-12/schema + type: object + x-aws-idp-document-type: Bank-checks + properties: + date: + format: date + description: >- + The date when the check was written, typically handwritten or printed + in the date field of the check. + type: string + x-aws-idp-evaluation-method: EXACT + dollar_amount: + type: number + description: >- + The numerical amount to be paid as specified on the check, typically + found in the amount box on the right side of the check. + x-aws-idp-evaluation-method: NUMERIC_EXACT + check_number: + type: string + description: >- + The unique sequential number identifying this specific check, usually + found in the upper right corner and bottom of the check. + x-aws-idp-evaluation-method: NUMERIC_EXACT + account_holder_name: + type: string + description: >- + The name of the person or entity who owns the bank account and wrote + the check, typically printed in the upper left corner. 
+ x-aws-idp-evaluation-method: EXACT + payee_name: + type: string + description: >- + The name of the person or entity receiving the payment, written on the + 'Pay to the order of' line of the check. + x-aws-idp-evaluation-method: EXACT + bank_name: + type: string + description: >- + The name of the financial institution where the account is held, + usually printed prominently on the check. + x-aws-idp-evaluation-method: EXACT + memo: + type: string + description: >- + Optional note or reference information written in the memo field, + typically in the lower left area of the check. + x-aws-idp-evaluation-method: EXACT + routing_number_valid: + type: string + description: >- + A boolean indicator of whether the bank routing number on the check is + valid and properly formatted. + x-aws-idp-evaluation-method: EXACT + bank_routing_number: + type: string + description: >- + The bank's routing number for electronic transactions, typically found + in the MICR line at the bottom of the check. + x-aws-idp-evaluation-method: EXACT + amount_in_words: + type: string + description: >- + The payment amount written out in words, typically on the line below + the payee name and ending with 'DOLLARS'. + x-aws-idp-evaluation-method: EXACT + is_signed: + type: boolean + description: >- + A boolean indicator of whether the check has been signed by the + account holder in the signature area. + x-aws-idp-evaluation-method: EXACT + $id: Bank-checks + - $schema: https://json-schema.org/draft/2020-12/schema + $defs: + Address: + type: object + description: >- + The complete business address of the employing company, including + street address, city, state, and postal code information. + properties: + City: + type: string + description: The city portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + Line1: + type: string + description: >- + The primary street address line of the company's business + location. 
+ x-aws-idp-evaluation-method: EXACT + State: + type: string + description: The state or province portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + ZipCode: + type: string + description: The postal code portion of the company's business address. + x-aws-idp-evaluation-method: EXACT + Line2: + type: string + description: >- + The secondary address line for the company, such as suite or floor + number. + x-aws-idp-evaluation-method: EXACT + account_summaryItem: + type: object + properties: + summary_desc: + type: string + description: >- + Description of the account summary item, such as opening balance + or closing balance. + x-aws-idp-evaluation-method: EXACT + summary_amount: + type: string + description: The monetary amount associated with this summary item. + x-aws-idp-evaluation-method: NUMERIC_EXACT + transaction_detail: + type: object + properties: + date: + type: string + description: The date when the transaction occurred. + x-aws-idp-evaluation-method: EXACT + description: + type: string + description: Description of the transaction or merchant information. + x-aws-idp-evaluation-method: EXACT + balance: + type: string + description: The account balance after this transaction. + x-aws-idp-evaluation-method: NUMERIC_EXACT + deposits: + type: string + description: Amount deposited or credited to the account. + x-aws-idp-evaluation-method: NUMERIC_EXACT + withdrawals: + type: string + description: Amount withdrawn or debited from the account. + x-aws-idp-evaluation-method: NUMERIC_EXACT description: >- - An annual tax document provided by employers to employees reporting wages earned and - taxes withheld during the tax year for federal and state income tax filing purposes, - containing comprehensive compensation and withholding information. 
- attributes: - - name: other - description: >- - Other compensation or benefits not covered in standard W2 boxes, representing - additional taxable or non-taxable benefits provided to the employee. - evaluation_method: EXACT - attributeType: simple - - name: nonqualified_plans_incom - description: >- - Income from nonqualified deferred compensation plans, representing distributions - or benefits from employer-sponsored retirement or compensation plans. - evaluation_method: NUMERIC_EXACT - attributeType: simple - - name: employer_info - groupAttributes: - - name: employer_address + A periodic financial document issued by banks detailing account activity, + balances, and transactions over a specific time period, providing account + holders with a summary of their financial activity and current account + status. + type: object + x-aws-idp-document-type: Bank-Statement + properties: + account_holder_address: + description: >- + The mailing address of the account holder as recorded by the bank, + typically displayed prominently on the statement header. + $ref: "#/$defs/Address" + x-aws-idp-evaluation-method: EXACT + account_number: + type: string + description: >- + The unique identifier for the bank account, often partially masked for + security purposes on the statement. + x-aws-idp-evaluation-method: EXACT + account_type: + type: string + description: >- + The category of bank account such as checking, savings, money market, + etc., indicating the type of banking service. + x-aws-idp-evaluation-method: EXACT + account_summary: + description: >- + Summary of key account information including opening balance, closing + balance, and other important account totals for the statement period. 
+ type: array + x-aws-idp-list-item-description: Each item represents a key account balance or summary figure + items: + $ref: "#/$defs/account_summaryItem" + x-aws-idp-evaluation-method: LLM + statement_end_date: + format: date + description: >- + The ending date of the statement period in MM/DD/YYYY format, + indicating when the reporting period concluded. + type: string + x-aws-idp-evaluation-method: EXACT + statement_start_date: + format: date + description: >- + The beginning date of the statement period in MM/DD/YYYY format, + indicating when the reporting period began. + type: string + x-aws-idp-evaluation-method: EXACT + account_holder_name: + type: string + description: >- + The name of the person or entity who owns the bank account, as + registered with the financial institution. + x-aws-idp-evaluation-method: EXACT + branch_transit_number: + type: string + description: >- + The specific branch identifier or transit number associated with the + account, used for routing and identification purposes. + x-aws-idp-evaluation-method: EXACT + bank_name: + type: string + description: >- + The name of the financial institution issuing the statement, typically + displayed prominently at the top of the document. + x-aws-idp-evaluation-method: EXACT + transaction_details: + description: >- + Detailed listing of all transactions that occurred during the + statement period, including deposits, withdrawals, and resulting + account balances. + type: array + x-aws-idp-list-item-description: Each item represents an individual transaction record + items: + $ref: "#/$defs/transaction_detail" + x-aws-idp-evaluation-method: LLM + $id: Bank-Statement + - $schema: https://json-schema.org/draft/2020-12/schema + $defs: + employer_info: + type: object + description: >- + Complete information about the employing organization including name, + address, tax identification numbers, and control numbers for + processing. 
+ properties: + control_number: + type: string + description: >- + A unique identifier assigned by the employer for tracking and + processing purposes. + x-aws-idp-evaluation-method: EXACT + employer_address: + type: string description: The complete business address of the employing company. - evaluation_method: EXACT - - name: control_number - description: A unique identifier assigned by the employer for tracking and processing purposes. - evaluation_method: EXACT - - name: employer_name + x-aws-idp-evaluation-method: EXACT + employer_name: + type: string description: The legal name of the employing company or organization. - evaluation_method: EXACT - - name: ein + x-aws-idp-evaluation-method: EXACT + ein: + type: string description: The federal Employer Identification Number of the company. - evaluation_method: EXACT - - name: employer_zip_code + x-aws-idp-evaluation-method: EXACT + employer_zip_code: + type: string description: The postal code portion of the employer's address. - evaluation_method: EXACT - description: >- - Complete information about the employing organization including name, address, - tax identification numbers, and control numbers for processing. - evaluation_method: LLM - attributeType: group - - name: filing_info - groupAttributes: - - name: omb_number - description: The Office of Management and Budget form number for the W2 form. - evaluation_method: EXACT - - name: verification_code + x-aws-idp-evaluation-method: EXACT + filing_info: + type: object + description: >- + Official form identification and verification information including + OMB numbers and validation codes for the W2 document. + properties: + verification_code: + type: string description: A verification code used to validate the authenticity of the form. - evaluation_method: EXACT - description: >- - Official form identification and verification information including OMB numbers - and validation codes for the W2 document. 
- evaluation_method: LLM - attributeType: group - - name: federal_tax_info - groupAttributes: - - name: federal_income_tax + x-aws-idp-evaluation-method: EXACT + omb_number: + type: string + description: The Office of Management and Budget form number for the W2 form. + x-aws-idp-evaluation-method: EXACT + code: + type: object + properties: + amount: + type: string + description: The monetary amount associated with this compensation code. + x-aws-idp-evaluation-method: NUMERIC_EXACT + code: + type: string + description: The letter code representing the type of compensation or benefit. + x-aws-idp-evaluation-method: EXACT + federal_tax_info: + type: object + description: >- + Federal tax withholding information including income tax, Social + Security tax, Medicare tax, and allocated tips for federal tax + reporting. + properties: + federal_income_tax: + type: string description: The amount of federal income tax withheld from the employee's pay. - evaluation_method: NUMERIC_EXACT - - name: allocated_tips + x-aws-idp-evaluation-method: NUMERIC_EXACT + allocated_tips: + type: string description: Tips allocated by the employer to the employee for tax purposes. - evaluation_method: NUMERIC_EXACT - - name: social_security_tax - description: The amount of Social Security tax withheld from the employee's pay. - evaluation_method: NUMERIC_EXACT - - name: medicare_tax + x-aws-idp-evaluation-method: NUMERIC_EXACT + social_security_tax: + type: string + description: >- + The amount of Social Security tax withheld from the employee's + pay. + x-aws-idp-evaluation-method: NUMERIC_EXACT + medicare_tax: + type: string description: The amount of Medicare tax withheld from the employee's pay. - evaluation_method: NUMERIC_EXACT - description: >- - Federal tax withholding information including income tax, Social Security tax, - Medicare tax, and allocated tips for federal tax reporting. 
- evaluation_method: LLM - attributeType: group - - name: employee_general_info - groupAttributes: - - name: employee_name_suffix - description: Name suffix of the employee such as Jr., Sr., III, etc. - evaluation_method: EXACT - - name: employee_address - description: The complete residential address of the employee. - evaluation_method: EXACT - - name: employee_last_name + x-aws-idp-evaluation-method: NUMERIC_EXACT + employee_general_info: + type: object + description: >- + Complete personal information about the employee including full name, + address, and Social Security Number for tax identification purposes. + properties: + employee_last_name: + type: string description: The family name or surname of the employee. - evaluation_method: EXACT - - name: employee_zip_code + x-aws-idp-evaluation-method: EXACT + employee_name_suffix: + type: string + description: Name suffix of the employee such as Jr., Sr., III, etc. + x-aws-idp-evaluation-method: EXACT + employee_zip_code: + type: string description: The postal code portion of the employee's address. - evaluation_method: EXACT - - name: first_name + x-aws-idp-evaluation-method: EXACT + employee_address: + type: string + description: The complete residential address of the employee. + x-aws-idp-evaluation-method: EXACT + first_name: + type: string description: The given name of the employee. - evaluation_method: EXACT - - name: ssn + x-aws-idp-evaluation-method: EXACT + ssn: + type: string description: The Social Security Number of the employee. - evaluation_method: EXACT - description: >- - Complete personal information about the employee including full name, address, - and Social Security Number for tax identification purposes. 
- evaluation_method: LLM - attributeType: group - - name: federal_wage_info - groupAttributes: - - name: social_security_tips + x-aws-idp-evaluation-method: EXACT + federal_wage_info: + type: object + description: >- + Federal wage and compensation information including total wages, tips, + and amounts subject to Social Security and Medicare taxes. + properties: + social_security_tips: + type: string description: Tips subject to Social Security tax reporting. - evaluation_method: NUMERIC_EXACT - - name: wages_tips_other_compensation + x-aws-idp-evaluation-method: NUMERIC_EXACT + wages_tips_other_compensation: + type: string description: Total wages, tips, and other compensation paid to the employee. - evaluation_method: NUMERIC_EXACT - - name: medicare_wages_tips + x-aws-idp-evaluation-method: NUMERIC_EXACT + medicare_wages_tips: + type: string description: Wages and tips subject to Medicare tax. - evaluation_method: NUMERIC_EXACT - - name: social_security_wages + x-aws-idp-evaluation-method: NUMERIC_EXACT + social_security_wages: + type: string description: Wages subject to Social Security tax. - evaluation_method: NUMERIC_EXACT - description: >- - Federal wage and compensation information including total wages, tips, and - amounts subject to Social Security and Medicare taxes. - evaluation_method: LLM - attributeType: group - - name: codes - listItemTemplate: - itemAttributes: - - name: amount - description: The monetary amount associated with this compensation code. - evaluation_method: NUMERIC_EXACT - - name: code - description: The letter code representing the type of compensation or benefit. - evaluation_method: EXACT - itemDescription: Each item represents a specific type of compensation or benefit with its corresponding code - description: >- - Additional compensation codes and amounts representing various types of benefits, - deferred compensation, or other taxable/non-taxable items. 
- evaluation_method: LLM - attributeType: list - - name: state_taxes_table - listItemTemplate: - itemAttributes: - - name: state_name - description: The name of the state for tax reporting purposes. - evaluation_method: EXACT - - name: local_wages_tips - description: Wages and tips subject to local income tax. - evaluation_method: NUMERIC_EXACT - - name: employer_state_id_number - description: The employer's state identification number for this jurisdiction. - evaluation_method: NUMERIC_EXACT - - name: state_wages_and_tips - description: Wages and tips subject to state income tax. - evaluation_method: NUMERIC_EXACT - - name: state_income_tax - description: State income tax withheld for this jurisdiction. - evaluation_method: NUMERIC_EXACT - - name: local_income_tax - description: Local income tax withheld for this jurisdiction. - evaluation_method: NUMERIC_EXACT - - name: locality_name - description: The name of the local jurisdiction for local tax reporting. - evaluation_method: EXACT - itemDescription: Each item represents state and local tax information for a specific jurisdiction - description: >- - State and local tax information including wages subject to tax, taxes withheld, - and jurisdiction details for state and local tax reporting. - evaluation_method: LLM - attributeType: list - - - name: Homeowners-Insurance-Application + x-aws-idp-evaluation-method: NUMERIC_EXACT + state_taxes_tableItem: + type: object + properties: + state_name: + type: string + description: The name of the state for tax reporting purposes. + x-aws-idp-evaluation-method: EXACT + local_wages_tips: + type: string + description: Wages and tips subject to local income tax. + x-aws-idp-evaluation-method: NUMERIC_EXACT + employer_state_id_number: + type: string + description: The employer's state identification number for this jurisdiction. + x-aws-idp-evaluation-method: NUMERIC_EXACT + state_wages_and_tips: + type: string + description: Wages and tips subject to state income tax. 
+ x-aws-idp-evaluation-method: NUMERIC_EXACT + state_income_tax: + type: string + description: State income tax withheld for this jurisdiction. + x-aws-idp-evaluation-method: NUMERIC_EXACT + local_income_tax: + type: string + description: Local income tax withheld for this jurisdiction. + x-aws-idp-evaluation-method: NUMERIC_EXACT + locality_name: + type: string + description: The name of the local jurisdiction for local tax reporting. + x-aws-idp-evaluation-method: EXACT description: >- - An application form for homeowners insurance coverage containing applicant personal information, - property details, coverage requirements, existing insurance history, and underwriting data - necessary for evaluating risk and determining appropriate coverage terms. - attributes: - - name: Expiration Date - description: >- - The date when the insurance policy expires and requires renewal, indicating when - coverage will terminate if not renewed. - evaluation_method: EXACT - attributeType: simple - - name: Purchase Date and Time - description: >- - The specific date and time when the insurance policy was purchased, including both - date and time components for precise transaction recording. - evaluation_method: EXACT - attributeType: simple - - name: Policy Number - description: >- - The unique identifier assigned to the insurance policy for tracking and reference - purposes throughout the policy lifecycle. - evaluation_method: EXACT - attributeType: simple - - name: Named Insured(s) and Mailing Address - description: >- - The complete name and mailing address of the primary insured party, representing - the policyholder and their contact information. - evaluation_method: EXACT - attributeType: simple - - name: Insurance Company - description: >- - The name and address of the insurance provider issuing the policy, including - complete company contact information. 
- evaluation_method: EXACT - attributeType: simple - - name: Insured Property - description: >- - The complete address of the property being insured, representing the physical - location covered by the homeowners insurance policy. - evaluation_method: EXACT - attributeType: simple - - name: Primary Phone number - description: >- - The main contact phone number for the policyholder, used for communication - regarding the insurance policy and claims. - evaluation_method: EXACT - attributeType: simple - - name: Effective Date - description: >- - The date when the insurance coverage begins and becomes active, marking the - start of the policy period. - evaluation_method: EXACT - attributeType: simple - - name: Primary Email - description: >- - The main email address for the policyholder, used for electronic communication - regarding policy matters and updates. - evaluation_method: EXACT - attributeType: simple - - name: Alternate Phone number - description: >- - The secondary contact phone number for the policyholder, providing an alternative - method of communication for policy-related matters. - evaluation_method: EXACT - attributeType: simple - - name: Co-Applicant Information - groupAttributes: - - name: Drivers License Number + An annual tax document provided by employers to employees reporting wages + earned and taxes withheld during the tax year for federal and state + income tax filing purposes, containing comprehensive compensation and + withholding information. + type: object + x-aws-idp-document-type: W2 + properties: + employer_info: + description: >- + Complete information about the employing organization including name, + address, tax identification numbers, and control numbers for + processing. + $ref: "#/$defs/employer_info" + x-aws-idp-evaluation-method: LLM + filing_info: + description: >- + Official form identification and verification information including + OMB numbers and validation codes for the W2 document. 
+ $ref: "#/$defs/filing_info" + x-aws-idp-evaluation-method: LLM + codes: + description: >- + Additional compensation codes and amounts representing various types + of benefits, deferred compensation, or other taxable/non-taxable + items. + type: array + x-aws-idp-list-item-description: >- + Each item represents a specific type of compensation or benefit with + its corresponding code + items: + $ref: "#/$defs/code" + x-aws-idp-evaluation-method: LLM + other: + type: string + description: >- + Other compensation or benefits not covered in standard W2 boxes, + representing additional taxable or non-taxable benefits provided to + the employee. + x-aws-idp-evaluation-method: EXACT + federal_tax_info: + description: >- + Federal tax withholding information including income tax, Social + Security tax, Medicare tax, and allocated tips for federal tax + reporting. + $ref: "#/$defs/federal_tax_info" + x-aws-idp-evaluation-method: LLM + state_taxes_table: + description: >- + State and local tax information including wages subject to tax, taxes + withheld, and jurisdiction details for state and local tax reporting. + type: array + x-aws-idp-list-item-description: >- + Each item represents state and local tax information for a specific + jurisdiction + items: + $ref: "#/$defs/state_taxes_tableItem" + x-aws-idp-evaluation-method: LLM + employee_general_info: + description: >- + Complete personal information about the employee including full name, + address, and Social Security Number for tax identification purposes. + $ref: "#/$defs/employee_general_info" + x-aws-idp-evaluation-method: LLM + federal_wage_info: + description: >- + Federal wage and compensation information including total wages, tips, + and amounts subject to Social Security and Medicare taxes. 
+ $ref: "#/$defs/federal_wage_info" + x-aws-idp-evaluation-method: LLM + nonqualified_plans_incom: + type: string + description: >- + Income from nonqualified deferred compensation plans, representing + distributions or benefits from employer-sponsored retirement or + compensation plans. + x-aws-idp-evaluation-method: NUMERIC_EXACT + $id: W2 + - $schema: https://json-schema.org/draft/2020-12/schema + $defs: + Auto Claims, Accidents, and Violations: + type: object + description: >- + Comprehensive history of auto claims, accidents, and traffic + violations for underwriting risk assessment purposes. + properties: + Major: + type: string + description: Information about major auto claims or incidents. + x-aws-idp-evaluation-method: EXACT + Number of Comp Claims: + type: string + description: The number of comprehensive auto insurance claims filed. + x-aws-idp-evaluation-method: EXACT + Number of Violations: + type: string + description: The total number of traffic violations. + x-aws-idp-evaluation-method: EXACT + At-Fault: + type: string + description: Details about at-fault accidents or incidents. + x-aws-idp-evaluation-method: EXACT + Number of Auto Accidents: + type: string + description: The total number of auto accidents. + x-aws-idp-evaluation-method: EXACT + Minor: + type: string + description: Information about minor auto claims or incidents. + x-aws-idp-evaluation-method: EXACT + Not-at-Fault: + type: string + description: Details about not-at-fault accidents or incidents. + x-aws-idp-evaluation-method: EXACT + Co-Applicant Information: + type: object + description: >- + Complete information about the co-applicant including personal + details, driver's license information, insurance history, and + relationship to primary applicant. + properties: + Drivers License Number: + type: string description: The driver's license number of the co-applicant. 
- evaluation_method: EXACT - - name: Length of Time with Current Auto Carrier - description: Duration of relationship with current auto insurance provider for co-applicant. - evaluation_method: EXACT - - name: DL State + x-aws-idp-evaluation-method: EXACT + Length of Time with Current Auto Carrier: + type: string + description: >- + Duration of relationship with current auto insurance provider for + co-applicant. + x-aws-idp-evaluation-method: EXACT + DL State: + type: string description: The state that issued the co-applicant's driver's license. - evaluation_method: EXACT - - name: Education Level + x-aws-idp-evaluation-method: EXACT + Education Level: + type: string description: The highest level of education completed by the co-applicant. - evaluation_method: EXACT - - name: Currently Insured- Auto + x-aws-idp-evaluation-method: EXACT + Currently Insured- Auto: + type: string description: Current auto insurance carrier for the co-applicant. - evaluation_method: EXACT - - name: Length of Time with Prior Auto Carrier - description: Duration of relationship with previous auto insurance provider for co-applicant. - evaluation_method: EXACT - - name: Date of Birth + x-aws-idp-evaluation-method: EXACT + Length of Time with Prior Auto Carrier: + type: string + description: >- + Duration of relationship with previous auto insurance provider for + co-applicant. + x-aws-idp-evaluation-method: EXACT + Date of Birth: + type: string description: The birth date of the co-applicant. - evaluation_method: EXACT - - name: Gender + x-aws-idp-evaluation-method: EXACT + Gender: + type: string description: The gender of the co-applicant. - evaluation_method: EXACT - - name: Marital Status + x-aws-idp-evaluation-method: EXACT + Marital Status: + type: string description: The marital status of the co-applicant. 
- evaluation_method: EXACT - - name: Relationship to Primary Applicant + x-aws-idp-evaluation-method: EXACT + Relationship to Primary Applicant: + type: string description: The relationship of the co-applicant to the primary policyholder. - evaluation_method: EXACT - - name: Name + x-aws-idp-evaluation-method: EXACT + Name: + type: string description: The full name of the co-applicant. - evaluation_method: EXACT - description: >- - Complete information about the co-applicant including personal details, driver's - license information, insurance history, and relationship to primary applicant. - evaluation_method: LLM - attributeType: group - - name: Auto Claims, Accidents, and Violations - groupAttributes: - - name: Major - description: Information about major auto claims or incidents. - evaluation_method: EXACT - - name: Number of Comp Claims - description: The number of comprehensive auto insurance claims filed. - evaluation_method: EXACT - - name: Number of Violations - description: The total number of traffic violations. - evaluation_method: EXACT - - name: At-Fault - description: Details about at-fault accidents or incidents. - evaluation_method: EXACT - - name: Number of Auto Accidents - description: The total number of auto accidents. - evaluation_method: EXACT - - name: Minor - description: Information about minor auto claims or incidents. - evaluation_method: EXACT - - name: Not-at-Fault - description: Details about not-at-fault accidents or incidents. - evaluation_method: EXACT - description: >- - Comprehensive history of auto claims, accidents, and traffic violations for - underwriting risk assessment purposes. - evaluation_method: LLM - attributeType: group - - name: Primary Applicant Information - groupAttributes: - - name: Type of Current Property Policy - description: The type of current property insurance policy held by primary applicant. 
- evaluation_method: EXACT - - name: Drivers License Number + x-aws-idp-evaluation-method: EXACT + Primary Applicant Information: + type: object + description: >- + Complete information about the primary applicant including personal + details, driver's license information, insurance history, and + existing policy details. + properties: + Type of Current Property Policy: + type: string + description: >- + The type of current property insurance policy held by primary + applicant. + x-aws-idp-evaluation-method: EXACT + Drivers License Number: + type: string description: The driver's license number of the primary applicant. - evaluation_method: EXACT - - name: Education Level + x-aws-idp-evaluation-method: EXACT + Education Level: + type: string description: The highest level of education completed by the primary applicant. - evaluation_method: EXACT - - name: Currently Insured Auto + x-aws-idp-evaluation-method: EXACT + Currently Insured Auto: + type: string description: Current auto insurance carrier for the primary applicant. - evaluation_method: EXACT - - name: Length of Time with Prior Auto Carrier - description: Duration of relationship with previous auto insurance provider for primary applicant. - evaluation_method: EXACT - - name: Gender + x-aws-idp-evaluation-method: EXACT + Length of Time with Prior Auto Carrier: + type: string + description: >- + Duration of relationship with previous auto insurance provider for + primary applicant. + x-aws-idp-evaluation-method: EXACT + Gender: + type: string description: The gender of the primary applicant. - evaluation_method: EXACT - - name: Marital Status + x-aws-idp-evaluation-method: EXACT + Marital Status: + type: string description: The marital status of the primary applicant. - evaluation_method: EXACT - - name: Name + x-aws-idp-evaluation-method: EXACT + Name: + type: string description: The full name of the primary applicant. 
- evaluation_method: EXACT - - name: Length of Time with Current Auto Carrier - description: Duration of relationship with current auto insurance provider for primary applicant. - evaluation_method: EXACT - - name: Existing Esurance Policy - description: Existing insurance policy number or reference for primary applicant. - evaluation_method: EXACT - - name: DL State + x-aws-idp-evaluation-method: EXACT + Length of Time with Current Auto Carrier: + type: string + description: >- + Duration of relationship with current auto insurance provider for + primary applicant. + x-aws-idp-evaluation-method: EXACT + Existing Esurance Policy: + type: string + description: >- + Existing insurance policy number or reference for primary + applicant. + x-aws-idp-evaluation-method: EXACT + DL State: + type: string description: The state that issued the primary applicant's driver's license. - evaluation_method: EXACT - - name: Date of Birth + x-aws-idp-evaluation-method: EXACT + Date of Birth: + type: string description: The birth date of the primary applicant. - evaluation_method: EXACT - - name: Years with Prior Property Company - description: Number of years with previous property insurance company for primary applicant. - evaluation_method: EXACT - description: >- - Complete information about the primary applicant including personal details, - driver's license information, insurance history, and existing policy details. - evaluation_method: LLM - attributeType: group \ No newline at end of file + x-aws-idp-evaluation-method: EXACT + Years with Prior Property Company: + type: string + description: >- + Number of years with previous property insurance company for + primary applicant. 
+ x-aws-idp-evaluation-method: EXACT + description: >- + An application form for homeowners insurance coverage containing applicant + personal information, property details, coverage requirements, existing + insurance history, and underwriting data necessary for evaluating risk + and determining appropriate coverage terms. + type: object + x-aws-idp-document-type: Homeowners-Insurance-Application + properties: + Expiration Date: + type: string + description: >- + The date when the insurance policy expires and requires renewal, + indicating when coverage will terminate if not renewed. + x-aws-idp-evaluation-method: EXACT + Purchase Date and Time: + type: string + description: >- + The specific date and time when the insurance policy was purchased, + including both date and time components for precise transaction + recording. + x-aws-idp-evaluation-method: EXACT + Policy Number: + type: string + description: >- + The unique identifier assigned to the insurance policy for tracking + and reference purposes throughout the policy lifecycle. + x-aws-idp-evaluation-method: EXACT + Alternate Phone number: + type: string + description: >- + The secondary contact phone number for the policyholder, providing an + alternative method of communication for policy-related matters. + x-aws-idp-evaluation-method: EXACT + Named Insured(s) and Mailing Address: + type: string + description: >- + The complete name and mailing address of the primary insured party, + representing the policyholder and their contact information. + x-aws-idp-evaluation-method: EXACT + Insurance Company: + type: string + description: >- + The name and address of the insurance provider issuing the policy, + including complete company contact information. + x-aws-idp-evaluation-method: EXACT + Co-Applicant Information: + description: >- + Complete information about the co-applicant including personal + details, driver's license information, insurance history, and + relationship to primary applicant. 
+ $ref: "#/$defs/Co-Applicant Information" + x-aws-idp-evaluation-method: LLM + Insured Property: + type: string + description: >- + The complete address of the property being insured, representing the + physical location covered by the homeowners insurance policy. + x-aws-idp-evaluation-method: EXACT + Primary Phone number: + type: string + description: >- + The main contact phone number for the policyholder, used for + communication regarding the insurance policy and claims. + x-aws-idp-evaluation-method: EXACT + Auto Claims, Accidents, and Violations: + description: >- + Comprehensive history of auto claims, accidents, and traffic + violations for underwriting risk assessment purposes. + $ref: "#/$defs/Auto Claims, Accidents, and Violations" + x-aws-idp-evaluation-method: LLM + Effective Date: + type: string + description: >- + The date when the insurance coverage begins and becomes active, + marking the start of the policy period. + x-aws-idp-evaluation-method: EXACT + Primary Email: + type: string + description: >- + The main email address for the policyholder, used for electronic + communication regarding policy matters and updates. + x-aws-idp-evaluation-method: EXACT + Primary Applicant Information: + description: >- + Complete information about the primary applicant including personal + details, driver's license information, insurance history, and + existing policy details. 
+ $ref: "#/$defs/Primary Applicant Information" + x-aws-idp-evaluation-method: LLM + $id: Homeowners-Insurance-Application0 + diff --git a/notebooks/examples/step2_classification_with_regex.ipynb b/notebooks/examples/step2_classification_with_regex.ipynb index b5b2b9da5..c870ee207 100644 --- a/notebooks/examples/step2_classification_with_regex.ipynb +++ b/notebooks/examples/step2_classification_with_regex.ipynb @@ -24,9 +24,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Libraries loaded and configured\n" + ] + } + ], "source": [ "import os\n", "import json\n", @@ -60,9 +68,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Loaded document: bank_statement\n", + "✅ Document pages: 6\n", + "✅ Configuration classes: 6\n" + ] + } + ], "source": [ "# Load OCR output from Step 1\n", "examples_dir = Path.cwd()\n", @@ -107,9 +125,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:idp_common.classification.service:Classification caching disabled\n", + "INFO:idp_common.classification.service:Initialized classification service with Bedrock backend using model us.amazon.nova-pro-v1:0\n", + "INFO:idp_common.classification.service:Using multimodal page-level classification method with document boundary detection\n", + "INFO:idp_common.classification.service:Document name regex match: 'bank_statement' matched pattern '(?i).*(statement).*' for class 'BankStatement'\n", + "INFO:idp_common.classification.service:Classifying all pages as 'BankStatement' based on document name regex match. 
Skipping LLM classification.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "==================================================\n", + "DOCUMENT NAME REGEX CLASSIFICATION\n", + "==================================================\n", + "Regex Pattern: (?i).*(statement).*\n", + "Document ID: bank_statement\n", + "Direct Match: ✅ YES\n", + "\n", + "⚡ Results:\n", + "Processing time: 0.002 seconds\n", + "Status: QUEUED\n", + "Sections: 1\n", + "Token usage: 0 (no LLM calls)\n", + "Method: Regex-based classification\n" + ] + } + ], "source": [ "print(\"=\" * 50)\n", "print(\"DOCUMENT NAME REGEX CLASSIFICATION\")\n", @@ -119,24 +168,35 @@ "regex_config = deepcopy(BASE_CONFIG)\n", "regex_config['classes'] = [\n", " {\n", - " 'name': 'BankStatement',\n", + " '$schema': 'https://json-schema.org/draft/2020-12/schema',\n", + " '$id': 'BankStatement',\n", + " 'x-aws-idp-document-type': 'BankStatement',\n", + " 'type': 'object',\n", " 'description': 'Employee wage statement',\n", - " 'document_name_regex': r'(?i).*(statement).*',\n", - " 'attributes': [{'name': 'Name', 'description': 'Name', 'attributeType': 'simple'}]\n", + " 'x-aws-idp-document-name-regex': r'(?i).*(statement).*',\n", + " 'properties': {\n", + " 'Name': {\n", + " 'type': 'string',\n", + " 'description': 'Name'\n", + " }\n", + " }\n", " },\n", " {\n", - " 'name': 'Other',\n", + " '$schema': 'https://json-schema.org/draft/2020-12/schema',\n", + " '$id': 'Other',\n", + " 'x-aws-idp-document-type': 'Other',\n", + " 'type': 'object',\n", " 'description': 'Other documents',\n", - " 'attributes': []\n", + " 'properties': {}\n", " }\n", "]\n", "\n", "\n", "# Test regex pattern\n", - "pattern = re.compile(regex_config['classes'][0]['document_name_regex'])\n", + "pattern = re.compile(regex_config['classes'][0]['x-aws-idp-document-name-regex'])\n", "match = pattern.search(document.id)\n", "\n", - "print(f\"Regex Pattern: {regex_config['classes'][0]['document_name_regex']}\")\n", + 
"print(f\"Regex Pattern: {regex_config['classes'][0]['x-aws-idp-document-name-regex']}\")\n", "print(f\"Document ID: {document.id}\")\n", "print(f\"Direct Match: {'✅ YES' if match else '❌ NO'}\")\n", "\n", @@ -164,9 +224,87 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:idp_common.classification.service:Classification caching disabled\n", + "INFO:idp_common.classification.service:Initialized classification service with Bedrock backend using model us.amazon.nova-pro-v1:0\n", + "INFO:idp_common.classification.service:Using multimodal page-level classification method with document boundary detection\n", + "INFO:idp_common.classification.service:Classifying document with 6 pages using multimodal page-level classification with bedrock backend\n", + "INFO:idp_common.classification.service:Attempting to retrieve cached page classifications for document bank_statement\n", + "INFO:idp_common.classification.service:Found 0 cached page classifications, classifying 6 remaining pages\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "==================================================\n", + "PAGE CONTENT REGEX CLASSIFICATION\n", + "==================================================\n", + "Page Content Regex Patterns:\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "ERROR:idp_common.s3:Error reading text from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/1/result.json: Unable to locate credentials\n", + "ERROR:idp_common.s3:Error reading text from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/2/result.json: Unable to locate credentials\n", + "ERROR:idp_common.s3:Error reading text from 
s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/6/result.json: Unable to locate credentials\n", + "ERROR:idp_common.s3:Error reading text from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/5/result.json: Unable to locate credentials\n", + "ERROR:idp_common.s3:Error reading text from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/3/result.json: Unable to locate credentials\n", + "WARNING:idp_common.classification.service:Failed to load text content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/1/result.json: Unable to locate credentials\n", + "ERROR:idp_common.s3:Error reading text from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/4/result.json: Unable to locate credentials\n", + "WARNING:idp_common.classification.service:Failed to load text content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/2/result.json: Unable to locate credentials\n", + "WARNING:idp_common.classification.service:Failed to load text content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/6/result.json: Unable to locate credentials\n", + "WARNING:idp_common.classification.service:Failed to load text content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/5/result.json: Unable to locate credentials\n", + "WARNING:idp_common.classification.service:Failed to load text content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/3/result.json: Unable to locate credentials\n", + "ERROR:idp_common.s3:Error reading binary content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/1/image.jpg: Unable to locate credentials\n", + 
"WARNING:idp_common.classification.service:Failed to load text content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/4/result.json: Unable to locate credentials\n", + "ERROR:idp_common.s3:Error reading binary content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/2/image.jpg: Unable to locate credentials\n", + "ERROR:idp_common.s3:Error reading binary content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/6/image.jpg: Unable to locate credentials\n", + "ERROR:idp_common.s3:Error reading binary content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/5/image.jpg: Unable to locate credentials\n", + "ERROR:idp_common.s3:Error reading binary content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/3/image.jpg: Unable to locate credentials\n", + "WARNING:idp_common.classification.service:Failed to load image content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/1/image.jpg: Unable to locate credentials\n", + "ERROR:idp_common.s3:Error reading binary content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/4/image.jpg: Unable to locate credentials\n", + "WARNING:idp_common.classification.service:Failed to load image content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/2/image.jpg: Unable to locate credentials\n", + "WARNING:idp_common.classification.service:Failed to load image content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/6/image.jpg: Unable to locate credentials\n", + "WARNING:idp_common.classification.service:Failed to load image content from 
s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/5/image.jpg: Unable to locate credentials\n", + "WARNING:idp_common.classification.service:Failed to load image content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/3/image.jpg: Unable to locate credentials\n", + "WARNING:idp_common.classification.service:No content available for page 1\n", + "WARNING:idp_common.classification.service:Failed to load image content from s3://idp-modular-output-665340521033-us-east-1/modular-sample-2025-09-11_18-45-40.pdf/pages/4/image.jpg: Unable to locate credentials\n", + "WARNING:idp_common.classification.service:No content available for page 4\n", + "WARNING:idp_common.classification.service:No content available for page 6\n", + "WARNING:idp_common.classification.service:No content available for page 5\n", + "WARNING:idp_common.classification.service:No content available for page 3\n", + "WARNING:idp_common.classification.service:No content available for page 2\n", + "INFO:idp_common.classification.service:All pages succeeded for document bank_statement - skipping cache (no retry needed)\n", + "WARNING:idp_common.classification.service:Document classified with 6 errors\n", + "INFO:idp_common.classification.service:Document classified with 1 sections in 2.62 seconds\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "⚡ Results:\n", + "Processing time: 2.618 seconds\n", + "Status: QUEUED\n", + "Sections: 1\n", + "\n", + "📊 Method Breakdown:\n", + "Regex classified: 0\n", + "LLM classified: 6\n" + ] + } + ], "source": [ "print(\"\\n\" + \"=\" * 50)\n", "print(\"PAGE CONTENT REGEX CLASSIFICATION\")\n", @@ -176,21 +314,40 @@ "page_regex_config = deepcopy(BASE_CONFIG)\n", "page_regex_config['classes'] = [\n", " {\n", - " 'name': 'Payslip',\n", + " '$schema': 'https://json-schema.org/draft/2020-12/schema',\n", + " '$id': 'Payslip',\n", + " 
'x-aws-idp-document-type': 'Payslip',\n", + " 'type': 'object',\n", " 'description': 'Employee wage statement',\n", - " 'document_page_content_regex': r'(?i)(gross\\s+pay|net\\s+pay|employee\\s+id)',\n", - " 'attributes': [{'name': 'EmployeeName', 'description': 'Name', 'attributeType': 'simple'}]\n", + " 'x-aws-idp-document-page-content-regex': r'(?i)(gross\\s+pay|net\\s+pay|employee\\s+id)',\n", + " 'properties': {\n", + " 'EmployeeName': {\n", + " 'type': 'string',\n", + " 'description': 'Name'\n", + " }\n", + " }\n", " },\n", " {\n", - " 'name': 'Invoice',\n", + " '$schema': 'https://json-schema.org/draft/2020-12/schema',\n", + " '$id': 'Invoice',\n", + " 'x-aws-idp-document-type': 'Invoice',\n", + " 'type': 'object',\n", " 'description': 'Business invoice',\n", - " 'document_page_content_regex': r'(?i)(invoice\\s+number|bill\\s+to|amount\\s+due)',\n", - " 'attributes': [{'name': 'InvoiceNumber', 'description': 'Number', 'attributeType': 'simple'}]\n", + " 'x-aws-idp-document-page-content-regex': r'(?i)(invoice\\s+number|bill\\s+to|amount\\s+due)',\n", + " 'properties': {\n", + " 'InvoiceNumber': {\n", + " 'type': 'string',\n", + " 'description': 'Number'\n", + " }\n", + " }\n", " },\n", " {\n", - " 'name': 'Other',\n", + " '$schema': 'https://json-schema.org/draft/2020-12/schema',\n", + " '$id': 'Other',\n", + " 'x-aws-idp-document-type': 'Other',\n", + " 'type': 'object',\n", " 'description': 'Other documents',\n", - " 'attributes': []\n", + " 'properties': {}\n", " }\n", "]\n", "\n", @@ -240,9 +397,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "==================================================\n", + "CONFIGURATION EXAMPLES\n", + "==================================================\n", + "Common Regex Patterns:\n", + "\n", + "Payslip:\n", + " Name: (?i).*(payslip|paystub|salary).*\n", + " Content: 
(?i)(gross\\s+pay|net\\s+pay|employee\\s+id)\n", + "\n", + "Invoice:\n", + " Name: (?i).*(invoice|bill|inv).*\n", + " Content: (?i)(invoice\\s+number|bill\\s+to|amount\\s+due)\n", + "\n", + "Bank Statement:\n", + " Name: (?i).*(statement|bank).*\n", + " Content: (?i)(account\\s+number|statement\\s+period)\n", + "\n", + "💡 Best Practices:\n", + "- Use (?i) for case-insensitive matching\n", + "- Use \\s+ for flexible whitespace\n", + "- Use | for multiple alternatives\n", + "- Test patterns with real documents\n", + "- Document name regex: single-class only\n", + "- Page content regex: multimodal page-level only\n" + ] + } + ], "source": [ "print(\"\\n\" + \"=\" * 50)\n", "print(\"CONFIGURATION EXAMPLES\")\n", @@ -288,9 +477,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "==================================================\n", + "✅ REGEX CLASSIFICATION COMPLETE\n", + "==================================================\n", + "\n", + "Key Benefits Demonstrated:\n", + "🚀 Massive performance improvement\n", + "💰 100% token usage reduction for matched patterns\n", + "🎯 Deterministic classification results\n", + "🔄 Seamless fallback to LLM when no match\n", + "⚙️ Simple configuration through regex patterns\n", + "\n", + "📌 Next step: Run extraction on the classified sections\n" + ] + } + ], "source": [ "print(\"\\n\" + \"=\" * 50)\n", "print(\"✅ REGEX CLASSIFICATION COMPLETE\")\n", @@ -303,11 +512,19 @@ "print(\"⚙️ Simple configuration through regex patterns\")\n", "print(\"\\n📌 Next step: Run extraction on the classified sections\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b20424e4", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "accelerated-intelligent-document-processing-on-aws", "language": "python", 
"name": "python3" }, diff --git a/notebooks/usecase-specific-examples/multi-page-bank-statement/config/classes.yaml b/notebooks/usecase-specific-examples/multi-page-bank-statement/config/classes.yaml index 7e32e6833..04fe1a456 100644 --- a/notebooks/usecase-specific-examples/multi-page-bank-statement/config/classes.yaml +++ b/notebooks/usecase-specific-examples/multi-page-bank-statement/config/classes.yaml @@ -1,55 +1,84 @@ # Document Classes and Attributes Configuration classes: - - name: Bank Statement - description: Monthly bank account statement - attributes: - - name: Account Number - description: Primary account identifier - attributeType: simple - evaluation_method: EXACT - - name: Statement Period - description: Statement period (e.g., January 2024) - evaluation_threshold: '0.8' - attributeType: simple - evaluation_method: FUZZY - - name: Account Holder Address + - $schema: https://json-schema.org/draft/2020-12/schema + $defs: + Transaction: + type: object + properties: + Date: + format: date + description: Transaction date (MM/DD/YYYY) + x-aws-idp-confidence-threshold: "0.9" + type: string + x-aws-idp-evaluation-method: FUZZY + Description: + description: Transaction description or merchant name + x-aws-idp-confidence-threshold: "0.7" + type: string + x-aws-idp-evaluation-method: SEMANTIC + Amount: + type: number + description: >- + Transaction amount (positive for deposits, negative for + withdrawals) + x-aws-idp-evaluation-method: NUMERIC_EXACT + required: + - Date + - Description + - Amount + Account Holder Address: description: Complete address information for the account holder - attributeType: group - groupAttributes: - - name: Street Number - description: House or building number - evaluation_threshold: '0.9' - evaluation_method: FUZZY - - name: Street Name - description: Name of the street - evaluation_threshold: '0.8' - evaluation_method: FUZZY - - name: City + type: object + properties: + City: description: City name - evaluation_threshold: '0.9' 
- evaluation_method: FUZZY - - name: State - description: State abbreviation (e.g., CA, NY) - evaluation_method: EXACT - - name: ZIP Code + x-aws-idp-confidence-threshold: "0.9" + type: string + x-aws-idp-evaluation-method: FUZZY + ZIP Code: + pattern: \d{5,9} description: 5 or 9 digit postal code - evaluation_method: EXACT - - name: Transactions - listItemTemplate: - itemAttributes: - - name: Date - description: Transaction date (MM/DD/YYYY) - evaluation_threshold: '0.9' - evaluation_method: FUZZY - - name: Description - description: Transaction description or merchant name - evaluation_threshold: '0.7' - evaluation_method: SEMANTIC - - name: Amount - description: >- - Transaction amount (positive for deposits, negative for - withdrawals) - evaluation_method: NUMERIC_EXACT - itemDescription: Individual transaction record + type: string + x-aws-idp-evaluation-method: EXACT + Street Name: + description: Name of the street + x-aws-idp-confidence-threshold: "0.8" + type: string + x-aws-idp-evaluation-method: FUZZY + Street Number: + description: House or building number + x-aws-idp-confidence-threshold: "0.9" + type: string + x-aws-idp-evaluation-method: FUZZY + State: + type: string + description: State abbreviation (e.g., CA, NY) + x-aws-idp-evaluation-method: EXACT + required: + - Street Name + - City + description: Monthly bank account statement + type: object + x-aws-idp-document-type: Bank Statement + properties: + Account Holder Address: + description: Complete address information for the account holder + $ref: "#/$defs/Account Holder Address" + Transactions: description: List of all transactions in the statement period - attributeType: list \ No newline at end of file + type: array + x-aws-idp-list-item-description: Individual transaction record + items: + $ref: "#/$defs/Transaction" + Account Number: + type: string + description: Primary account identifier + x-aws-idp-evaluation-method: EXACT + Statement Period: + type: string + description: Statement period 
(e.g., January 2024) + x-aws-idp-evaluation-method: FUZZY + required: + - Account Number + $id: Bank Statement + diff --git a/notebooks/usecase-specific-examples/multi-page-bank-statement/step4_assessment_granular.ipynb b/notebooks/usecase-specific-examples/multi-page-bank-statement/step4_assessment_granular.ipynb index c7e5411bf..fe8c07dac 100644 --- a/notebooks/usecase-specific-examples/multi-page-bank-statement/step4_assessment_granular.ipynb +++ b/notebooks/usecase-specific-examples/multi-page-bank-statement/step4_assessment_granular.ipynb @@ -153,28 +153,39 @@ "outputs": [], "source": [ "# Display document classes with confidence thresholds\n", - "classes = CONFIG.get('classes', [])\n", - "print(f\"\\nDocument Classes with Confidence Thresholds:\")\n", - "for cls in classes:\n", - " print(f\"\\n{cls['name']}:\")\n", - " for attr in cls.get('attributes', [])[:5]: # Show first 5 attributes\n", - " threshold = attr.get('confidence_threshold', 'default')\n", - " attr_type = attr.get('attributeType', 'simple')\n", - " print(f\" - {attr['name']} ({attr_type}): threshold = {threshold}\")\n", - " \n", - " # Show nested attributes for groups and lists\n", - " if attr_type == 'group':\n", - " for group_attr in attr.get('groupAttributes', [])[:3]:\n", - " group_threshold = group_attr.get('confidence_threshold', 'default')\n", - " print(f\" • {group_attr['name']}: {group_threshold}\")\n", - " elif attr_type == 'list':\n", - " list_template = attr.get('listItemTemplate', {})\n", - " for item_attr in list_template.get('itemAttributes', [])[:3]:\n", - " item_threshold = item_attr.get('confidence_threshold', 'default')\n", - " print(f\" • {item_attr['name']}: {item_threshold}\")\n", - " \n", - " if len(cls.get('attributes', [])) > 5:\n", - " print(f\" ... 
and {len(cls.get('attributes', [])) - 5} more\")" + "classes = CONFIG.get('classes', [])\n", + "print(f\"\\nDocument Classes with Confidence Thresholds:\")\n", + "for cls in classes:\n", + " # Get class name from JSON Schema format or legacy format\n", + " class_name = cls.get('x-aws-idp-document-type') or cls.get('$id') or cls.get('name', 'Unknown')\n", + " print(f\"\\n{class_name}:\")\n", + " \n", + " # JSON Schema format uses 'properties' instead of 'attributes'\n", + " properties = cls.get('properties', {})\n", + " if not properties:\n", + " # Fallback to legacy format for backwards compatibility\n", + " properties = {attr['name']: attr for attr in cls.get('attributes', [])}\n", + " \n", + " # Show first 5 properties\n", + " for idx, (attr_name, attr_schema) in enumerate(list(properties.items())[:5]):\n", + " threshold = attr_schema.get('x-aws-idp-confidence-threshold', 'default')\n", + " attr_type = attr_schema.get('type', 'string')\n", + " print(f\" - {attr_name} ({attr_type}): threshold = {threshold}\")\n", + " \n", + " # Show nested properties for objects and arrays\n", + " if attr_type == 'object' and 'properties' in attr_schema:\n", + " for group_attr_name, group_attr in list(attr_schema['properties'].items())[:3]:\n", + " group_threshold = group_attr.get('x-aws-idp-confidence-threshold', 'default')\n", + " print(f\" • {group_attr_name}: {group_threshold}\")\n", + " elif attr_type == 'array' and 'items' in attr_schema:\n", + " item_schema = attr_schema['items']\n", + " if item_schema.get('type') == 'object' and 'properties' in item_schema:\n", + " for item_attr_name, item_attr in list(item_schema['properties'].items())[:3]:\n", + " item_threshold = item_attr.get('x-aws-idp-confidence-threshold', 'default')\n", + " print(f\" • {item_attr_name}: {item_threshold}\")\n", + " \n", + " if len(properties) > 5:\n", + " print(f\" ... 
and {len(properties) - 5} more\")" ] }, { diff --git a/src/ui/src/components/json-schema-builder/SchemaInspector.jsx b/src/ui/src/components/json-schema-builder/SchemaInspector.jsx index 3e2cb4367..8d18c2b23 100644 --- a/src/ui/src/components/json-schema-builder/SchemaInspector.jsx +++ b/src/ui/src/components/json-schema-builder/SchemaInspector.jsx @@ -15,6 +15,8 @@ import { X_AWS_IDP_EVALUATION_METHOD, X_AWS_IDP_CONFIDENCE_THRESHOLD, X_AWS_IDP_EXAMPLES, + X_AWS_IDP_DOCUMENT_NAME_REGEX, + X_AWS_IDP_PAGE_CONTENT_REGEX, } from '../../constants/schemaConstants'; const SchemaInspector = ({ @@ -99,10 +101,34 @@ const SchemaInspector = ({ {selectedClass[X_AWS_IDP_DOCUMENT_TYPE] && ( - onUpdateClass({ [X_AWS_IDP_EXAMPLES]: examples })} - /> + <> + onUpdateClass({ [X_AWS_IDP_EXAMPLES]: examples })} + /> + + + onUpdateClass({ [X_AWS_IDP_DOCUMENT_NAME_REGEX]: detail.value || undefined })} + placeholder="e.g., (?i).*(invoice|bill).*" + /> + + + + onUpdateClass({ [X_AWS_IDP_PAGE_CONTENT_REGEX]: detail.value || undefined })} + placeholder="e.g., (?i)(invoice\\s+number|bill\\s+to)" + /> + + )} {usedIn.length > 0 && ( diff --git a/src/ui/src/constants/schemaConstants.js b/src/ui/src/constants/schemaConstants.js index d3ef8db68..409c488ff 100644 --- a/src/ui/src/constants/schemaConstants.js +++ b/src/ui/src/constants/schemaConstants.js @@ -67,6 +67,10 @@ export const X_AWS_IDP_DOCUMENT_TYPE = 'x-aws-idp-document-type'; /** Classification metadata for document type */ export const X_AWS_IDP_CLASSIFICATION = 'x-aws-idp-classification'; +/** Regex patterns for classification optimization */ +export const X_AWS_IDP_DOCUMENT_NAME_REGEX = 'x-aws-idp-document-name-regex'; +export const X_AWS_IDP_PAGE_CONTENT_REGEX = 'x-aws-idp-document-page-content-regex'; + // ============================================================================ // AWS IDP List-Specific Extensions // ============================================================================ diff --git 
a/src/ui/src/hooks/useSchemaDesigner.js b/src/ui/src/hooks/useSchemaDesigner.js index cb854697d..c7554a342 100644 --- a/src/ui/src/hooks/useSchemaDesigner.js +++ b/src/ui/src/hooks/useSchemaDesigner.js @@ -127,6 +127,8 @@ const convertJsonSchemaToClasses = (jsonSchema) => { properties: extractedProperties, required: schema.required || [], }, + // Preserve examples if they exist in the schema + ...(schema[X_AWS_IDP_EXAMPLES] ? { [X_AWS_IDP_EXAMPLES]: schema[X_AWS_IDP_EXAMPLES] } : {}), }; allClasses.push(docTypeClass); @@ -188,6 +190,8 @@ const convertJsonSchemaToClasses = (jsonSchema) => { properties: extractedProperties, required: jsonSchema.required || [], }, + // Preserve examples if they exist in the schema + ...(jsonSchema[X_AWS_IDP_EXAMPLES] ? { [X_AWS_IDP_EXAMPLES]: jsonSchema[X_AWS_IDP_EXAMPLES] } : {}), }; classes.push(mainClass);