55
66import formalpdf
77import logging
8+ import pypdfium2 .raw as pdfium_c
9+ import ctypes
810
911
1012logging .getLogger ("pypdfium2" ).setLevel (logging .ERROR )
@@ -38,7 +40,7 @@ def process_pdf(pdf_path, output_dir):
3840 image = pdfium_page .render (scale = scale , may_draw_forms = False ).to_pil ()
3941 widgets = page .widgets ()
4042
41- image_filename = f"{ pdf_name } -{ page_idx } .png "
43+ image_filename = f"{ pdf_name } -{ page_idx } .jpg "
4244
4345 # Create image info
4446 image_info = {
@@ -48,19 +50,42 @@ def process_pdf(pdf_path, output_dir):
4850 }
4951
5052 # Save image
51- image .save (images_dir / image_filename )
53+ image .save (images_dir / image_filename , format = "JPEG" )
5254
5355 # Process annotations
5456 annotations = []
5557 for widget in widgets :
56- # convert bounding box in pt to pixels
57- top = widget .rect .top * scale
58- left = widget .rect .left * scale
59- bottom = widget .rect .bottom * scale
60- right = widget .rect .right * scale
61-
62- y0 = image .height - top
63- y1 = image .height - bottom
58+ # Use pypdfium2's page-to-device coordinate transformation
59+ # to properly convert PDF coordinates to image pixel coordinates
60+ page_x1 , page_y1 = widget .rect .left , widget .rect .bottom
61+ page_x2 , page_y2 = widget .rect .right , widget .rect .top
62+
63+ # Convert page coordinates to device coordinates
64+ # using pypdfium2's FPDF_PageToDevice function
65+ dev_x1 = ctypes .c_int ()
66+ dev_y1 = ctypes .c_int ()
67+ dev_x2 = ctypes .c_int ()
68+ dev_y2 = ctypes .c_int ()
69+
70+ # FPDF_PageToDevice(page, start_x, start_y, size_x, size_y, rotate, page_x, page_y, device_x, device_y)
71+ pdfium_c .FPDF_PageToDevice (
72+ pdfium_page .raw , 0 , 0 , image .width , image .height , 0 ,
73+ page_x1 , page_y1 , ctypes .byref (dev_x1 ), ctypes .byref (dev_y1 )
74+ )
75+ pdfium_c .FPDF_PageToDevice (
76+ pdfium_page .raw , 0 , 0 , image .width , image .height , 0 ,
77+ page_x2 , page_y2 , ctypes .byref (dev_x2 ), ctypes .byref (dev_y2 )
78+ )
79+
80+ # Read the integer device coordinates out of the ctypes values as Python floats
81+ left = float (dev_x1 .value )
82+ bottom = float (dev_y1 .value )
83+ right = float (dev_x2 .value )
84+ top = float (dev_y2 .value )
85+
86+ # Device coordinates have a top-left origin, so no vertical flip is needed; just order the two y values
87+ y0 = min (top , bottom )
88+ y1 = max (top , bottom )
6489
6590 # try for the category, otherwise "Text"
6691 categories = { "Text" : 0 ,
@@ -102,11 +127,13 @@ def process_pdf(pdf_path, output_dir):
102127
103128 total_widgets += len (widgets )
104129
105- document .document .close ()
106130 return f"Processed { pdf_name } : { num_pages } pages, { total_widgets } widgets"
107131
108132 except Exception as e :
109133 return f"Error processing { pdf_name } : { str (e )} "
134+
135+ finally :
136+ document .document .close ()
110137
111138
112139def main ():
0 commit comments