diff --git a/CHANGELOG.md b/CHANGELOG.md index 210d3056d..0631e932a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,7 +38,7 @@ SPDX-License-Identifier: MIT-0 ### Fixed -- **Fillable PDF form fields missing from rendered page images** — Fixed bug where fillable PDF form fields (text inputs, checkboxes, radio buttons, dropdowns) were not rendered in page images, causing OCR and extraction to miss user-entered data. Root cause: pypdfium2's `render(may_draw_forms=True)` requires `PdfDocument.init_forms()` to be called first to initialize the form rendering engine. Added `init_forms()` call in both Pattern 2 (`OcrService`) and Pattern 1 (`create_pdf_page_images`) PDF rendering pipelines. ([#240](https://github.com/aws-solutions-library-samples/accelerated-intelligent-document-processing-on-aws/issues/240)) +- **Fillable PDF form fields missing from rendered page images** — Fixed bug where fillable PDF form fields (text inputs, checkboxes, radio buttons, dropdowns) were not rendered in page images, causing OCR and extraction to miss user-entered data. Two-part fix: (1) `PdfDocument.init_forms()` initializes the form rendering engine so PDFium can process form fields, and (2) `page.flatten()` merges form field appearances into page content before rendering — required because many fillable PDFs (especially government forms) lack pre-generated appearance streams. Applied in both Pattern 2 (`OcrService`) and Pattern 1 (`create_pdf_page_images`) PDF rendering pipelines. ([#240](https://github.com/aws-solutions-library-samples/accelerated-intelligent-document-processing-on-aws/issues/240)) - **Discovery subscription handler dropping errorMessage and other fields** — Fixed bug where the UI subscription handler did `{ ...oldJob, status: updatedJob.status }`, discarding all fields except status from real-time subscription updates. Error messages, discovered class names, and status messages were being sent by the backend but silently dropped by the UI. Now spreads all fields: `{ ...oldJob, ...updatedJob }`. diff --git a/lib/idp_common_pkg/idp_common/ocr/service.py b/lib/idp_common_pkg/idp_common/ocr/service.py index fe999de5b..62e907bf0 100644 --- a/lib/idp_common_pkg/idp_common/ocr/service.py +++ b/lib/idp_common_pkg/idp_common/ocr/service.py @@ -426,6 +426,12 @@ def process_document(self, document: Document) -> Document: page_images: Dict[int, bytes] = {} for i in pages_to_render: page = pdf_document[i] + # Flatten form fields into page content before rendering. + # Many fillable PDFs (e.g., government forms) lack appearance + # streams for form fields — flatten() forces PDFium to generate + # them and merge into page content so render() can display them. + # Requires init_forms() to have been called before page retrieval. + page.flatten() page_images[i] = self._extract_page_image(page, True, i + 1) pdf_document.close() diff --git a/lib/idp_common_pkg/tests/unit/ocr/test_ocr_service.py b/lib/idp_common_pkg/tests/unit/ocr/test_ocr_service.py index 658c75af1..95733997f 100644 --- a/lib/idp_common_pkg/tests/unit/ocr/test_ocr_service.py +++ b/lib/idp_common_pkg/tests/unit/ocr/test_ocr_service.py @@ -353,6 +353,11 @@ def test_process_document_calls_init_forms_for_fillable_pdfs( # Verify init_forms() was called to enable fillable PDF form rendering mock_pdf_doc.init_forms.assert_called_once() + # Verify flatten() was called on the page to merge form fields + # into page content (needed for PDFs without appearance streams) + mock_page = mock_pdf_doc.__getitem__.return_value + mock_page.flatten.assert_called_once() + @patch("boto3.client") @patch("idp_common.ocr.service.pdfium.PdfDocument") def test_process_document_success( diff --git a/patterns/unified/src/bda_processresults_function/index.py b/patterns/unified/src/bda_processresults_function/index.py index 4b5645430..32e18a4e5 100644 --- a/patterns/unified/src/bda_processresults_function/index.py +++ b/patterns/unified/src/bda_processresults_function/index.py @@ -165,6 +165,11 @@ def create_pdf_page_images(bda_result_bucket, output_bucket, object_key): for page_num in range(len(pdf_document)): # Render page to a PIL image page = pdf_document[page_num] + # Flatten form fields into page content before rendering. + # Many fillable PDFs (e.g., government forms) lack appearance + # streams for form fields — flatten() forces PDFium to generate + # them and merge into page content so render() can display them. + page.flatten() pil_img = page.render(scale=150 / 72).to_pil() # Save the image to a BytesIO object as JPEG